# Pattern Recognition 24H1
#### Runze Ji, Jiashuo Tian, Ziqian Liu

#### Import necessary Modules
* pandas
* scikit-learn
* itertools.islice
* tqdm

In [1]:
import os
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from itertools import islice
import csv

#### Specify the parameters used in training the classifier:
* Number of training files (TRAIN_FILES_COUNT)
* Epochs (EPOCHS)
* Path to train files (TRAIN_FILES_PATH)
* Path to Model (MODEL_PATH)

In [21]:
TRAIN_FILES_COUNT = 10000
TRAIN_FILES_OFFSET = 0
MEMORY_CLEANING = True

TRAIN_FILES_PATH = '../../PR/train'
MODEL_PATH = '../../PR/rfcModel.ptm'
LOGS_PATH = '../../PR/rfcEval.csv'

TRAIN_FILES = os.listdir(TRAIN_FILES_PATH)
TRAIN_FILES_END = TRAIN_FILES_COUNT + TRAIN_FILES_OFFSET

print(f'[init] Found {len(TRAIN_FILES)} Training Files\n')

[init] Found 18329 Training Files



#### Slice training files and encode labels
* All files containing datasets will be sliced in specified count, allowing separate training
* Encode 'type'
* Show Correspondence between label and encoded label

In [22]:
print(f'[init] Reading from Index-{TRAIN_FILES_OFFSET} to Index-{TRAIN_FILES_END-1}')
train_files_pb = tqdm(islice(TRAIN_FILES, TRAIN_FILES_OFFSET, TRAIN_FILES_END), '[preproc.loadCSV] Loading CSV Files...')

all_labels = []

for file in train_files_pb:
    file_path = os.path.join(TRAIN_FILES_PATH, file)
    data = pd.read_csv(file_path)
    all_labels.extend(data['type'].unique())

label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# 打印标签和对应的编码
for label, encoded_label in zip(label_encoder.classes_, range(len(label_encoder.classes_))):
    print(f"Label: {label} --> Encoded Label: {encoded_label}")

[init] Reading from Index-0 to Index-9999


[preproc.loadCSV] Loading CSV Files...: 10000it [01:06, 150.67it/s]

Label: 刺网 --> Encoded Label: 0
Label: 围网 --> Encoded Label: 1
Label: 拖网 --> Encoded Label: 2





#### Transforms dataframe and extend datatypes
* Analyzes dataframe and extract features
* Extend features

In [24]:
X_all = []
y_all = []

train_files_pb = tqdm(islice(TRAIN_FILES, TRAIN_FILES_COUNT),'[preproc.transform] Transforming Data...', TRAIN_FILES_COUNT)

for file in train_files_pb:
    file_path = os.path.join(TRAIN_FILES_PATH, file)
    data = pd.read_csv(file_path)

    # 转换时间列，提取特征等
    data['time'] = pd.to_datetime(data['time'])
    data['hour'] = data['time'].dt.hour
    data['day_of_week'] = data['time'].dt.dayofweek
    data['month'] = data['time'].dt.month

    # 使用转换后的标签
    data['type_encoded'] = label_encoder.transform(data['type'])

    X = data[['lat', 'lon', '速度', '方向', 'hour', 'day_of_week', 'month']]
    y = data['type_encoded']

    X_all.append(X)
    y_all.append(y)

# 将所有数据合并为一个大的 DataFrame
X = pd.concat(X_all, ignore_index=True)
y = pd.concat(y_all, ignore_index=True)

  data['time'] = pd.to_datetime(data['time'])
  data['time'] = pd.to_datetime(data['time'])
  data['time'] = pd.to_datetime(data['time'])
  data['time'] = pd.to_datetime(data['time'])
  data['time'] = pd.to_datetime(data['time'])
  data['time'] = pd.to_datetime(data['time'])
  data['time'] = pd.to_datetime(data['time'])
  data['time'] = pd.to_datetime(data['time'])
  data['time'] = pd.to_datetime(data['time'])
[preproc.transform] Transforming Data...: 100%|██████████| 10000/10000 [02:24<00:00, 69.11it/s]


#### Splitting the Dataset into train set and test set

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
if MEMORY_CLEANING:
    X = 0
    X_all = 0
    y = 0
    y_all = 0

#### Initialize Random Forest Classifier
* Train Classifier

In [26]:
rf_classifier = RandomForestClassifier(n_estimators=80, random_state=42, verbose=2, n_jobs=8)
rf_classifier.fit(X_train, y_train)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 80
building tree 2 of 80
building tree 3 of 80
building tree 4 of 80
building tree 5 of 80
building tree 6 of 80
building tree 7 of 80
building tree 8 of 80


KeyboardInterrupt: 

: 

#### Test prediction accuracy on test set

In [12]:
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:   45.5s


Accuracy: 0.9472605479610833


#### Model Saving

In [10]:
from joblib import dump, load
dump(rf_classifier, '../../PR/RandomForestClassifier.joblib')

['RandomForestClassifier.joblib']

#### Predict

In [13]:
TEST_DATASET_PATH = '../../PR/test_dataset'
TEST_FILES = os.listdir(TEST_DATASET_PATH)
print(f'[verify] Found {len(TEST_FILES)} Test Files\n')

'''
all_labels_test = []

for file in test_files_pb:
    file_path = os.path.join(TEST_DATASET_PATH, file)
    data = pd.read_csv(file_path)
    all_labels_test.extend(data['type'].unique())

label_encoder = LabelEncoder()
label_encoder.fit(all_labels_test)
'''

X_verify_all = []

verify_files_pb = tqdm(TEST_FILES,'[preproc.transform] Transforming Data...')

for file in verify_files_pb:
    file_path = os.path.join(TEST_DATASET_PATH, file)
    data = pd.read_csv(file_path)

    # 转换时间列，提取特征等
    data['time'] = pd.to_datetime(data['time'])
    data['hour'] = data['time'].dt.hour
    data['day_of_week'] = data['time'].dt.dayofweek
    data['month'] = data['time'].dt.month

    X_verify = data[['渔船ID', 'lat', 'lon', '速度', '方向', 'hour', 'day_of_week', 'month']]
    #X_verify = data[['lat', 'lon', '速度', '方向', 'hour', 'day_of_week', 'month']]

    X_verify_all.append(X_verify)

# 将所有数据合并为一个大的 DataFrame
X_verify = pd.concat(X_verify_all, ignore_index=True)
X_verify

[verify] Found 4034 Test Files



[preproc.transform] Transforming Data...: 100%|██████████| 4034/4034 [00:46<00:00, 87.57it/s] 


Unnamed: 0,渔船ID,lat,lon,速度,方向,hour,day_of_week,month
0,18330,30.685278,122.232972,8.0,22.0,7,5,10
1,18330,30.694222,122.241833,8.6,40.0,7,5,10
2,18330,30.702194,122.252056,8.6,44.0,7,5,10
3,18330,30.719944,122.269750,8.9,40.0,7,5,10
4,18330,30.728972,122.278306,9.4,42.0,7,5,10
...,...,...,...,...,...,...,...,...
6422980,22363,30.940000,123.557889,3.4,152.0,3,6,10
6422981,22363,30.935917,123.560111,3.2,156.0,3,6,10
6422982,22363,30.921333,123.573750,3.3,135.0,3,6,10
6422983,22363,30.918056,123.577806,3.6,130.0,3,6,10


In [None]:
from joblib import load
rf_classifier = load('../../PR/RandomForestClassifier.joblib')

In [15]:
y_test_pred = rf_classifier.predict(X_verify[['lat', 'lon', '速度', '方向', 'hour', 'day_of_week', 'month']])
y_test_pred

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:  1.1min


array([1, 1, 0, ..., 1, 1, 1])

In [16]:
X_verify['type'] = y_test_pred
#X_test.loc[X_test['prediction' == 0], 'prediction'] = '刺网'
X_duplicates = X_verify[['渔船ID', 'type']].replace({0:"刺网", 1:"围网", 2:"拖网"})

In [17]:
X_final = X_duplicates.drop_duplicates(subset='渔船ID', keep='first')
X_final

Unnamed: 0,渔船ID,type
0,18330,围网
1880,18331,刺网
2865,18332,围网
4429,18333,围网
6599,18334,拖网
...,...,...
6411160,22359,刺网
6412497,22360,围网
6415171,22361,拖网
6415964,22362,围网


In [18]:
X_final.to_csv('../../PR/submissions/rfc/submission_rfc.csv', index=False)