# Classification

In [7]:
# Loading all the modules that I will use in this notebook
from itertools import product
import cudf, cuml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from cuml.neighbors import KNeighborsRegressor as knnCM
from cuml.ensemble import RandomForestRegressor as rfCM

### 1. Load Data

In [8]:
# Load the train, test and validation datasets from CSV, then show the first 5 rows of the train dataset to confirm that the data was loaded
train = pd.read_csv('../input/classification/classification/train/videos_emg_features_05_01_train.csv')
test = pd.read_csv('../input/classification/classification/test/videos_emg_features_05_01_test.csv')
validation = pd.read_csv('../input/classification/classification/validation/videos_emg_features_05_01_validation.csv')
train.head()

Unnamed: 0,Subject,0_iav,0_aac,0_ar1,0_ar2,0_ar3,0_ar4,0_cc1,0_cc2,0_cc3,...,6_mnp,6_ttp,6_vcf,6_psr,6_ohm,6_maxx,video,arousal,valence,class
0,Subject 26,0.000693,2e-06,-0.294017,0.193476,-0.557275,0.58587,0.294017,-0.221918,0.433606,...,2.467522e-15,6.193481e-13,175910.121778,175910.121778,1.136194,3.597469e-07,GroupA\01,4.0,5.0,neutral
1,Subject 26,0.000678,1e-06,-0.226482,0.115931,-0.460327,0.567215,0.226482,-0.129059,0.400917,...,2.24405e-15,5.632566e-13,178406.356796,178406.356796,1.13212,3.404466e-07,GroupA\01,4.0,5.0,neutral
2,Subject 26,0.000646,1e-06,-0.211141,0.078928,-0.399045,0.543397,0.211141,-0.08726,0.364224,...,2.276071e-15,5.712938e-13,195627.240519,195627.240519,1.143079,3.235088e-07,GroupA\01,4.0,5.0,neutral
3,Subject 26,0.000645,1e-06,-0.194372,0.068487,-0.39865,0.522556,0.194372,-0.075143,0.368694,...,2.158618e-15,5.418131e-13,195613.944335,195613.944335,1.141087,3.02586e-07,GroupA\01,4.0,5.0,neutral
4,Subject 26,0.000667,1e-06,-0.202488,0.095966,-0.396178,0.539132,0.202488,-0.105682,0.354309,...,2.189921e-15,5.496702e-13,186209.998678,186209.998678,1.135205,2.963975e-07,GroupA\01,4.0,5.0,neutral


### 2. Basic Data Exploration

In [None]:
# Summary of the train dataset: number of rows and columns, the dtype of each column and the memory usage
train.info()

In [None]:
# Summary of the validation dataset: number of rows and columns, the dtype of each column and the memory usage
validation.info()

In [None]:
# Summary of the test dataset: number of rows and columns, the dtype of each column and the memory usage
test.info()

In [None]:
# Column (feature) names of the train dataset; the validation and test datasets are expected to use the same columns
print(train.columns.to_list())

In [None]:
# The dtype of every column of the train dataset, listed in column order
print(train.dtypes.to_list())

In [None]:
# Names of the train columns whose dtype is object (i.e. the non-numeric, string-like columns)
print(train.select_dtypes(include=['object']).columns.to_list())

In [None]:
# Shapes (number of rows, number of columns) of the train, validation and test datasets, in that order
print(train.shape)
print(validation.shape)
print(test.shape)

In [None]:
# Descriptive statistics of the train dataset, one column per feature (describe() summarises the numeric columns by default)
train.describe()

In [None]:
# Total number of NaN/null values over all columns of the train dataset (0 means nothing is missing)
train.isna().sum().sum()

In [None]:
# Total number of NaN/null values over all columns of the validation dataset (0 means nothing is missing)
validation.isna().sum().sum()

In [None]:
# Total number of NaN/null values over all columns of the test dataset (0 means nothing is missing)
test.isna().sum().sum()

### 3. EDA, Visualization

In [None]:
# Unique values of the valence column in the train dataset, together with the number of rows carrying each value
# (both arrays are reused by the valence pie chart further down)
valences=train["valence"]
unique_valences, num_examples_per_valence = np.unique(valences, return_counts = True)
unique_valences

In [None]:
# Unique values of the class column in the train dataset, together with the number of rows carrying each value
# np.unique returns the labels sorted, and the counts are aligned with that sorted order
classes=train["class"]
unique_classes, num_examples_per_class = np.unique(classes, return_counts = True)
unique_classes

In [None]:
# Percentage of train entries per class. The counts follow the sorted label order returned by
# np.unique, so index 0/1/2 is taken to be negative/neutral/positive (assumes exactly these three classes).
negative_perc, neutral_perc, positive_perc = (
    num_examples_per_class[idx] / num_examples_per_class.sum() * 100 for idx in range(3)
)
neutral_perc

In [None]:
# Pie chart showing how the train entries are distributed over the valence values
fig, ax = plt.subplots(nrows=1, ncols=1, figsize = (15, 15))

# define the color palette which we will use
cmap = plt.get_cmap("tab20")
# sample as many colors from the palette as there are unique valence values, so that each valence is drawn in a different color
sampled_colors = [cmap(i) for i in range (len(num_examples_per_valence))]

# draw the pie chart: wedge sizes are the per-valence row counts, wedge labels are the valence values
ax.pie(x = num_examples_per_valence, 
       colors = sampled_colors,
       labels = unique_valences, 
       wedgeprops= {'linewidth': 1, 
                    'edgecolor': 'black'}, 
       textprops = {'size': 'large', 
                    'family': "monospace",
                    'weight': 'medium'} 
);

In [None]:
# Pie chart of the share of train entries per class; every wedge is labelled "<class> <percentage>%"
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 15))

# percentages rendered with two decimals, in the same order as unique_classes
neg, neut, pos = ("{:.2f}".format(perc) for perc in (negative_perc, neutral_perc, positive_perc))
my_labels = [unique_classes[idx] + " " + text + "%" for idx, text in enumerate((neg, neut, pos))]

# one color of the tab20 palette per class, so that every class gets its own color
cmap = plt.get_cmap("tab20")
sampled_colors = [cmap(i) for i in range(len(num_examples_per_class))]

wedge_style = {'linewidth': 1, 'edgecolor': 'black'}
text_style = {'size': 'large', 'family': "monospace", 'weight': 'medium'}
ax.pie(
    x=num_examples_per_class,
    colors=sampled_colors,
    labels=my_labels,
    wedgeprops=wedge_style,
    textprops=text_style,
);

### 4. Data Preprocessing + Data Split

In [9]:
# Drop the Subject, video and arousal columns from the train dataset; after this the last two columns are valence and class
# Note: the cell overwrites `train`, so running it a second time without reloading the data raises a KeyError
train = train.drop(['Subject' ,'video', 'arousal'], axis = 1)
train.head()

Unnamed: 0,0_iav,0_aac,0_ar1,0_ar2,0_ar3,0_ar4,0_cc1,0_cc2,0_cc3,0_cc4,...,6_mdf,6_pkf,6_mnp,6_ttp,6_vcf,6_psr,6_ohm,6_maxx,valence,class
0,0.000693,2e-06,-0.294017,0.193476,-0.557275,0.58587,0.294017,-0.221918,0.433606,-0.966924,...,604.16,491.52,2.467522e-15,6.193481e-13,175910.121778,175910.121778,1.136194,3.597469e-07,5.0,neutral
1,0.000678,1e-06,-0.226482,0.115931,-0.460327,0.567215,0.226482,-0.129059,0.400917,-0.908324,...,634.88,583.68,2.24405e-15,5.632566e-13,178406.356796,178406.356796,1.13212,3.404466e-07,5.0,neutral
2,0.000646,1e-06,-0.211141,0.078928,-0.399045,0.543397,0.211141,-0.08726,0.364224,-0.840275,...,593.92,583.68,2.276071e-15,5.712938e-13,195627.240519,195627.240519,1.143079,3.235088e-07,5.0,neutral
3,0.000645,1e-06,-0.194372,0.068487,-0.39865,0.522556,0.194372,-0.075143,0.368694,-0.81155,...,655.36,583.68,2.158618e-15,5.418131e-13,195613.944335,195613.944335,1.141087,3.02586e-07,5.0,neutral
4,0.000667,1e-06,-0.202488,0.095966,-0.396178,0.539132,0.202488,-0.105682,0.354309,-0.825662,...,665.6,573.44,2.189921e-15,5.496702e-13,186209.998678,186209.998678,1.135205,2.963975e-07,5.0,neutral


In [10]:
# Drop the Subject, video and arousal columns from the validation dataset; after this the last two columns are valence and class
# Note: the cell overwrites `validation`, so running it a second time without reloading the data raises a KeyError
validation = validation.drop(['Subject' ,'video', 'arousal'], axis = 1)
validation.head()

Unnamed: 0,0_iav,0_aac,0_ar1,0_ar2,0_ar3,0_ar4,0_cc1,0_cc2,0_cc3,0_cc4,...,6_mdf,6_pkf,6_mnp,6_ttp,6_vcf,6_psr,6_ohm,6_maxx,valence,class
0,0.000441,1e-06,-0.334551,0.017524,-0.34862,0.210088,0.334551,-0.020455,0.341489,-0.317702,...,993.28,532.48,5.839758e-16,1.465779e-13,284079.951714,284079.951714,1.113174,1.632425e-07,8.0,positive
1,0.00047,1e-06,-0.345069,0.02206,-0.273772,0.259987,0.345069,-0.025867,0.266691,-0.363991,...,829.44,532.48,6.81312e-16,1.710093e-13,301391.185259,301391.185259,1.13744,1.735655e-07,8.0,positive
2,0.000466,1e-06,-0.358741,0.042098,-0.279966,0.276975,0.358741,-0.049649,0.266066,-0.387515,...,870.4,532.48,6.865532e-16,1.723249e-13,317636.011109,317636.011109,1.136466,1.828983e-07,8.0,positive
3,0.000442,1e-06,-0.267404,0.049737,-0.361819,0.248929,0.267404,-0.056387,0.341417,-0.376413,...,1003.52,542.72,6.021871e-16,1.51149e-13,318048.430724,318048.430724,1.122141,1.671134e-07,8.0,positive
4,0.000429,1e-06,-0.250008,0.032683,-0.33725,0.266309,0.250008,-0.036769,0.32485,-0.396075,...,1013.76,1280.0,6.032106e-16,1.514059e-13,300760.143538,300760.143538,1.11633,1.739748e-07,8.0,positive


In [11]:
# Drop the Subject, video and arousal columns from the test dataset; after this the last two columns are valence and class
# Note: the cell overwrites `test`, so running it a second time without reloading the data raises a KeyError
test = test.drop(['Subject' ,'video', 'arousal'], axis = 1)
test.head()

Unnamed: 0,0_iav,0_aac,0_ar1,0_ar2,0_ar3,0_ar4,0_cc1,0_cc2,0_cc3,0_cc4,...,6_mdf,6_pkf,6_mnp,6_ttp,6_vcf,6_psr,6_ohm,6_maxx,valence,class
0,0.000477,1e-06,-0.282164,-0.073709,-0.465545,0.166727,0.282164,0.084108,0.504701,-0.292948,...,993.28,655.36,5.752697e-16,1.443927e-13,280006.02726,280006.02726,1.108416,1.69626e-07,4.0,neutral
1,0.000465,1e-06,-0.262964,-0.079181,-0.462096,0.194851,0.262964,0.089592,0.503496,-0.34201,...,1013.76,655.36,5.502947e-16,1.38124e-13,291634.642456,291634.642456,1.108709,1.731016e-07,4.0,neutral
2,0.000429,1e-06,-0.244149,-0.062111,-0.43696,0.172627,0.244149,0.069693,0.467413,-0.293659,...,1003.52,655.36,5.418429e-16,1.360026e-13,271807.048776,271807.048776,1.103043,1.606352e-07,4.0,neutral
3,0.000427,1e-06,-0.283302,-0.008829,-0.449354,0.224499,0.283302,0.01008,0.453883,-0.377344,...,1044.48,665.6,5.544107e-16,1.391571e-13,267030.478892,267030.478892,1.096577,1.659668e-07,4.0,neutral
4,0.000417,1e-06,-0.25408,-0.033362,-0.428612,0.231616,0.25408,0.0376,0.444728,-0.386125,...,1013.76,665.6,5.200007e-16,1.305202e-13,271600.712459,271600.712459,1.100119,1.642974e-07,4.0,neutral


In [12]:
# Train features: every column except the last two (valence = regression target, class = class label)
X_train = train.iloc[:,:-2]
X_train

Unnamed: 0,0_iav,0_aac,0_ar1,0_ar2,0_ar3,0_ar4,0_cc1,0_cc2,0_cc3,0_cc4,...,6_wl,6_mnf,6_mdf,6_pkf,6_mnp,6_ttp,6_vcf,6_psr,6_ohm,6_maxx
0,0.000693,0.000002,-0.294017,0.193476,-0.557275,0.585870,0.294017,-0.221918,0.433606,-0.966924,...,-4.712230e-06,777.582881,604.16,491.52,2.467522e-15,6.193481e-13,175910.121778,175910.121778,1.136194,3.597469e-07
1,0.000678,0.000001,-0.226482,0.115931,-0.460327,0.567215,0.226482,-0.129059,0.400917,-0.908324,...,-2.219070e-06,795.819529,634.88,583.68,2.244050e-15,5.632566e-13,178406.356796,178406.356796,1.132120,3.404466e-07
2,0.000646,0.000001,-0.211141,0.078928,-0.399045,0.543397,0.211141,-0.087260,0.364224,-0.840275,...,4.887581e-06,798.745054,593.92,583.68,2.276071e-15,5.712938e-13,195627.240519,195627.240519,1.143079,3.235088e-07
3,0.000645,0.000001,-0.194372,0.068487,-0.398650,0.522556,0.194372,-0.075143,0.368694,-0.811550,...,7.152557e-07,804.709039,655.36,583.68,2.158618e-15,5.418131e-13,195613.944335,195613.944335,1.141087,3.025860e-07
4,0.000667,0.000001,-0.202488,0.095966,-0.396178,0.539132,0.202488,-0.105682,0.354309,-0.825662,...,5.165736e-07,803.127950,665.60,573.44,2.189921e-15,5.496702e-13,186209.998678,186209.998678,1.135205,2.963975e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136895,0.000516,0.000001,-0.362316,0.014572,-0.336977,0.353298,0.362316,-0.017212,0.331177,-0.528804,...,-1.788162e-07,970.297467,870.40,532.48,1.159675e-15,2.910785e-13,236097.357271,236097.357271,1.118380,2.229397e-07
136896,0.000527,0.000001,-0.347752,0.014216,-0.346542,0.401977,0.347752,-0.016688,0.340758,-0.607442,...,-1.792161e-06,956.605358,819.20,624.64,1.128472e-15,2.832464e-13,251853.292594,251853.292594,1.129257,2.375530e-07
136897,0.000532,0.000001,-0.389285,0.031756,-0.374169,0.393635,0.389285,-0.037937,0.359974,-0.606184,...,-2.642475e-06,936.343407,788.48,532.48,1.201034e-15,3.014595e-13,250166.091238,250166.091238,1.133727,2.462810e-07
136898,0.000525,0.000001,-0.411930,0.052148,-0.392349,0.395622,0.411930,-0.062889,0.367674,-0.613813,...,-4.533975e-06,938.880738,737.28,450.56,1.143954e-15,2.871324e-13,270763.326829,270763.326829,1.143312,2.026677e-07


In [13]:
# Regression target for the train data: the second-to-last column (valence)
y_train = train.iloc[:,-2]
y_train

0         5.0
1         5.0
2         5.0
3         5.0
4         5.0
         ... 
136895    9.0
136896    9.0
136897    9.0
136898    9.0
136899    9.0
Name: valence, Length: 136900, dtype: float64

In [14]:
# Class labels for the train data: the last column (class)
# Despite the `pred_` prefix these are the labels read from the dataset, not model predictions
pred_train = train.iloc[:,-1]
pred_train

0          neutral
1          neutral
2          neutral
3          neutral
4          neutral
            ...   
136895    positive
136896    positive
136897    positive
136898    positive
136899    positive
Name: class, Length: 136900, dtype: object

In [15]:
# Validation features: every column except the last two (valence = regression target, class = class label)
X_validation = validation.iloc[:,:-2]
X_validation

Unnamed: 0,0_iav,0_aac,0_ar1,0_ar2,0_ar3,0_ar4,0_cc1,0_cc2,0_cc3,0_cc4,...,6_wl,6_mnf,6_mdf,6_pkf,6_mnp,6_ttp,6_vcf,6_psr,6_ohm,6_maxx
0,0.000441,0.000001,-0.334551,0.017524,-0.348620,0.210088,0.334551,-0.020455,0.341489,-0.317702,...,-2.519672e-06,1089.883827,993.28,532.48,5.839758e-16,1.465779e-13,284079.951714,284079.951714,1.113174,1.632425e-07
1,0.000470,0.000001,-0.345069,0.022060,-0.273772,0.259987,0.345069,-0.025867,0.266691,-0.363991,...,-2.524838e-06,1012.890525,829.44,532.48,6.813120e-16,1.710093e-13,301391.185259,301391.185259,1.137440,1.735655e-07
2,0.000466,0.000001,-0.358741,0.042098,-0.279966,0.276975,0.358741,-0.049649,0.266066,-0.387515,...,2.019187e-06,1043.770519,870.40,532.48,6.865532e-16,1.723249e-13,317636.011109,317636.011109,1.136466,1.828983e-07
3,0.000442,0.000001,-0.267404,0.049737,-0.361819,0.248929,0.267404,-0.056387,0.341417,-0.376413,...,2.324842e-06,1107.718475,1003.52,542.72,6.021871e-16,1.511490e-13,318048.430724,318048.430724,1.122141,1.671134e-07
4,0.000429,0.000001,-0.250008,0.032683,-0.337250,0.266309,0.250008,-0.036769,0.324850,-0.396075,...,9.845659e-08,1105.282820,1013.76,1280.00,6.032106e-16,1.514059e-13,300760.143538,300760.143538,1.116330,1.739748e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34017,0.000723,0.000002,-0.319766,0.026889,-0.560346,0.317735,0.319766,-0.031188,0.542870,-0.576469,...,-4.588549e-06,1067.403952,1013.76,798.72,1.587567e-15,3.984793e-13,230911.959153,230911.959153,1.096663,2.798654e-07
34018,0.000837,0.000002,-0.313002,0.042930,-0.481843,0.370989,0.313002,-0.049648,0.457921,-0.625814,...,1.723964e-07,1104.600618,1024.00,686.08,1.489861e-15,3.739552e-13,240956.652418,240956.652418,1.094295,2.749026e-07
34019,0.000908,0.000002,-0.148813,-0.103651,-0.242217,0.311284,0.148813,0.111363,0.269191,-0.436976,...,5.352461e-06,1115.519141,1013.76,849.92,1.428781e-15,3.586240e-13,230218.115760,230218.115760,1.088580,2.779582e-07
34020,0.000872,0.000002,-0.155239,-0.100510,-0.232381,0.321751,0.155239,0.108311,0.257551,-0.446052,...,8.625256e-07,1103.095271,1013.76,849.92,1.205931e-15,3.026887e-13,251503.807192,251503.807192,1.098494,2.616199e-07


In [16]:
# Regression target for the validation data: the second-to-last column (valence)
y_validation = validation.iloc[:,-2]
y_validation

0        8.0
1        8.0
2        8.0
3        8.0
4        8.0
        ... 
34017    4.0
34018    4.0
34019    4.0
34020    4.0
34021    4.0
Name: valence, Length: 34022, dtype: float64

In [17]:
# Class labels for the validation data: the last column (class)
# Despite the `pred_` prefix these are the labels read from the dataset, not model predictions
pred_validation = validation.iloc[:,-1]
pred_validation

0        positive
1        positive
2        positive
3        positive
4        positive
           ...   
34017     neutral
34018     neutral
34019     neutral
34020     neutral
34021     neutral
Name: class, Length: 34022, dtype: object

In [18]:
# Test features: every column except the last two (valence = regression target, class = class label)
X_test = test.iloc[:,:-2]
X_test

Unnamed: 0,0_iav,0_aac,0_ar1,0_ar2,0_ar3,0_ar4,0_cc1,0_cc2,0_cc3,0_cc4,...,6_wl,6_mnf,6_mdf,6_pkf,6_mnp,6_ttp,6_vcf,6_psr,6_ohm,6_maxx
0,0.000477,1.211073e-06,-0.282164,-0.073709,-0.465545,0.166727,0.282164,0.084108,0.504701,-0.292948,...,2.924024e-06,1106.773055,993.28,655.36,5.752697e-16,1.443927e-13,280006.027260,280006.027260,1.108416,1.696260e-07
1,0.000465,1.164340e-06,-0.262964,-0.079181,-0.462096,0.194851,0.262964,0.089592,0.503496,-0.342010,...,-6.643470e-07,1127.921677,1013.76,655.36,5.502947e-16,1.381240e-13,291634.642456,291634.642456,1.108709,1.731016e-07
2,0.000429,1.100598e-06,-0.244149,-0.062111,-0.436960,0.172627,0.244149,0.069693,0.467413,-0.293659,...,2.531475e-06,1119.944004,1003.52,655.36,5.418429e-16,1.360026e-13,271807.048776,271807.048776,1.103043,1.606352e-07
3,0.000427,1.070966e-06,-0.283302,-0.008829,-0.449354,0.224499,0.283302,0.010080,0.453883,-0.377344,...,2.197901e-06,1148.386357,1044.48,665.60,5.544107e-16,1.391571e-13,267030.478892,267030.478892,1.096577,1.659668e-07
4,0.000417,1.051405e-06,-0.254080,-0.033362,-0.428612,0.231616,0.254080,0.037600,0.444728,-0.386125,...,1.938682e-06,1136.543983,1013.76,665.60,5.200007e-16,1.305202e-13,271600.712459,271600.712459,1.100119,1.642974e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30786,0.000378,9.782695e-07,-0.200098,-0.094488,-0.394690,0.114856,0.200098,0.103941,0.435715,-0.189922,...,3.077239e-07,1109.602126,1003.52,512.00,3.801815e-16,9.542556e-14,293369.805483,293369.805483,1.112779,1.265193e-07
30787,0.000387,9.882117e-07,-0.176761,-0.052263,-0.418838,0.167116,0.176761,0.056882,0.442662,-0.278080,...,3.008856e-06,1095.440043,1003.52,512.00,4.053952e-16,1.017542e-13,297765.497411,297765.497411,1.117202,1.223218e-07
30788,0.000415,1.043870e-06,-0.173375,-0.044805,-0.490658,0.187141,0.173375,0.048689,0.514548,-0.331580,...,-3.388402e-07,1043.734078,911.36,522.24,4.317347e-16,1.083654e-13,284341.209554,284341.209554,1.122948,1.411798e-07
30789,0.000401,1.021297e-06,-0.189029,-0.056490,-0.484879,0.185424,0.189029,0.061829,0.514858,-0.328624,...,-1.996970e-06,1043.507844,911.36,675.84,4.404827e-16,1.105612e-13,300924.020520,300924.020520,1.129758,1.484544e-07


In [19]:
# Regression target for the test data: the second-to-last column (valence)
y_test = test.iloc[:,-2]
y_test

0        4.0
1        4.0
2        4.0
3        4.0
4        4.0
        ... 
30786    5.0
30787    5.0
30788    5.0
30789    5.0
30790    5.0
Name: valence, Length: 30791, dtype: float64

In [20]:
# Class labels for the test data: the last column (class)
# Despite the `pred_` prefix these are the labels read from the dataset, not model predictions
pred_test = test.iloc[:,-1]
pred_test

0        neutral
1        neutral
2        neutral
3        neutral
4        neutral
          ...   
30786    neutral
30787    neutral
30788    neutral
30789    neutral
30790    neutral
Name: class, Length: 30791, dtype: object

In [21]:
# Standardize the features to zero mean and unit variance (StandardScaler does NOT bound the values to [-1, 1]).
# The scaler is fitted on the train data only and then applied unchanged to the validation and test data.
# From here on X_train, X_validation and X_test are NumPy arrays instead of DataFrames.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validation = scaler.transform(X_validation)
X_test = scaler.transform(X_test)

### 5. Training first models

In [None]:
# Baseline models, all with their default parameters (apart from the GPU tree method for XGBoost):
# XGBoost regressor, cuML random forest regressor, cuML k-nearest-neighbours regressor and scikit-learn decision tree regressor
# NOTE(review): tree_method='gpu_hist' is deprecated in XGBoost 2.x in favour of tree_method='hist' with device='cuda' - confirm against the installed version
xgb = XGBRegressor(tree_method = 'gpu_hist')
rf = rfCM()
knn = knnCM()
dc = DecisionTreeRegressor()

In [22]:
# Mapping from a whole valence value to its class: 1-3 -> negative, 4-6 -> neutral, 7-9 -> positive
# NOTE(review): the name `dict` shadows the built-in dict type for the rest of the notebook, so any later
# call such as dict(...) fails; consider renaming it once every reader of this name has been updated
dict = {1:'negative', 2:'negative', 3:'negative', 4: 'neutral', 5:'neutral', 6:'neutral',
            7:'positive', 8:'positive', 9:'positive'}

In [23]:
# Scoring the quasi classification of the model
def score_final(y_true, pred):
    """Print classification metrics for valence predictions mapped to classes.

    Every predicted valence is floored to a whole number, clamped to the 1..9
    scale and mapped to a class (1-3 negative, 4-6 neutral, 7-9 positive).
    Flooring and clamping here means raw regressor outputs (fractional or
    slightly out of range) no longer raise a KeyError; predictions that are
    already whole values between 1 and 9 are scored exactly as before.

    Parameters
    ----------
    y_true : array-like of str
        True class labels ('negative', 'neutral' or 'positive').
    pred : array-like of numbers
        Predicted valence values, one per entry of ``y_true``.

    Returns
    -------
    None
        The classification report, accuracy, micro/macro F1 scores and the
        confusion matrix are printed.
    """
    # Local mapping, so the function does not depend on a notebook-level variable
    valence_to_class = {1: 'negative', 2: 'negative', 3: 'negative',
                        4: 'neutral', 5: 'neutral', 6: 'neutral',
                        7: 'positive', 8: 'positive', 9: 'positive'}
    # Same binning convention as the rest of the notebook: floor, then keep within 1..9
    valence_bins = np.clip(np.floor(np.asarray(pred, dtype=float)), 1, 9)
    pred = np.array([valence_to_class[valence] for valence in valence_bins])
    f1_micro = f1_score(y_true, pred, average='micro')
    f1_macro = f1_score(y_true, pred, average='macro')
    print(classification_report(y_true, pred))
    print("--------------------")
    print(f'The accuracy of the model is: {accuracy_score(y_true, pred)}')
    print("--------------------")
    print(f'The f1-micro score for the model is: {f1_micro}')
    print("--------------------")
    print(f'The f1-macro score for the model is: {f1_macro}')
    print("--------------------")
    print(confusion_matrix(y_true, pred))

In [24]:
# Scoring the regression model
def score_regression(y_validation, pred):
    """Print MAE, MSE, RMSE and R2 of `pred` against the true values `y_validation`.

    Nothing is returned; every metric is printed followed by a separator line.
    """
    separator = "--------------------"
    print(f'Mean absolute error:  {mean_absolute_error(y_validation, pred)}')
    print(separator)
    # the MSE is computed once and reused for the RMSE
    mse = mean_squared_error(y_validation, pred)
    print(f'Mean squared error: {mse}')
    print(separator)
    print(f'Root mean squared error: {np.sqrt(mse)}')
    print(separator)
    print(f'R2 score: {r2_score(y_validation,pred)}')
    print(separator)
    print("--------------------")

In [107]:
def XGBdata(pred_xgb):
    """Convert raw regression outputs into whole valence values on the 1..9 scale.

    Each prediction is floored and then clamped to [1, 9], so an out-of-range
    regressor output of any size (not just a fixed handful of values such as
    10, 11, 12, 0, -1, -2, -3) ends up on a valid valence.

    Parameters
    ----------
    pred_xgb : array-like of numbers
        Raw predictions of a regressor.

    Returns
    -------
    numpy.ndarray
        A new float array of the same shape holding whole values between 1 and 9;
        the input is not modified.
    """
    return np.clip(np.floor(pred_xgb), 1, 9)

In [None]:
# Train the DecisionTreeRegressor on the standardized train features (all features) with valence as the target
dc.fit(X_train, y_train)

In [None]:
# Train the cuML RandomForestRegressor on the standardized train features (all features) with valence as the target
rf.fit(X_train, y_train)

In [None]:
# Train the XGBRegressor on the standardized train features (all features) with valence as the target
xgb.fit(X_train, y_train)

In [50]:
# Train the cuML KNeighborsRegressor on the standardized train features (all features) with valence as the target
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [None]:
# Predict the validation valence with the DecisionTree model, then print the regression metrics and the class-level metrics
# NOTE(review): unlike the other models, these predictions are passed on without np.floor - confirm they are always whole valence values
pred_dc = dc.predict(X_validation)
score_regression(y_validation,pred_dc)
score_final(pred_validation, pred_dc)

In [None]:
# Predict the validation valence with the RandomForest model, then print the regression metrics and the class-level metrics
# np.floor turns the continuous predictions into whole valence values before they are mapped to classes
pred_rf = rf.predict(X_validation)
score_regression(y_validation, pred_rf)
score_final(pred_validation, np.floor(pred_rf))

In [None]:
# Predict the validation valence with the XGBoost model, then print the regression metrics and the class-level metrics
# XGBdata converts the raw predictions into whole valence values before they are mapped to classes
pred_xgb = xgb.predict(X_validation)
score_regression(y_validation, pred_xgb)
score_final(pred_validation, XGBdata(pred_xgb))

In [None]:
# Predict the validation valence with the KNN model, then print the regression metrics and the class-level metrics
# np.floor turns the continuous predictions into whole valence values before they are mapped to classes
pred_knn = knn.predict(X_validation)
score_regression(y_validation, pred_knn)
score_final(pred_validation, np.floor(pred_knn))

### 6. Feature Selection

In [26]:
# Feature selection: keep the 200 features with the best univariate f_regression score.
# The selector is fitted on the train data only and then used to reduce the validation data to the same columns.
fs1 = SelectKBest(f_regression, k=200)
X_train_fs1 = fs1.fit_transform(X_train, y_train)
X_validation_fs1 = fs1.transform(X_validation)

In [None]:
# Feature selection: keep the 100 features with the best univariate f_regression score.
# The selector is fitted on the train data only and then used to reduce the validation data to the same columns.
fs2 = SelectKBest(f_regression, k=100)
X_train_fs2 = fs2.fit_transform(X_train, y_train)
X_validation_fs2 = fs2.transform(X_validation)

In [None]:
# Shapes (rows, columns) of the reduced train and validation data: first the 200-feature versions, then the 100-feature versions
print(X_train_fs1.shape)
print(X_validation_fs1.shape)
print(X_train_fs2.shape)
print(X_validation_fs2.shape)

In [None]:
# Train the DecisionTreeRegressor on the 200 selected features (this refits the same `dc` object used earlier)
dc.fit(X_train_fs1, y_train)

In [None]:
# Train the RandomForestRegressor on the 200 selected features (this refits the same `rf` object used earlier)
rf.fit(X_train_fs1, y_train)

In [None]:
# Train the XGBRegressor on the 200 selected features (this refits the same `xgb` object used earlier)
xgb.fit(X_train_fs1, y_train)

In [None]:
# Train the KNN regressor on the 200 selected features (this refits the same `knn` object used earlier)
knn.fit(X_train_fs1, y_train)

In [None]:
# Predict the validation valence with the DecisionTree model trained on 200 features, then print the regression and class-level metrics
# NOTE(review): unlike the other models, these predictions are passed on without np.floor - confirm they are always whole valence values
pred_dc_fs1 = dc.predict(X_validation_fs1)
score_regression(y_validation,pred_dc_fs1)
score_final(pred_validation, pred_dc_fs1)

In [None]:
# Predict the validation valence with the RandomForest model trained on 200 features, then print the regression and class-level metrics
# np.floor turns the continuous predictions into whole valence values before they are mapped to classes
pred_rf_fs1 = rf.predict(X_validation_fs1)
score_regression(y_validation, pred_rf_fs1)
score_final(pred_validation, np.floor(pred_rf_fs1))

In [None]:
# Predict the validation valence with the XGBoost model trained on 200 features, then print the regression and class-level metrics
# XGBdata converts the raw predictions into whole valence values before they are mapped to classes
pred_xgb_fs1 = xgb.predict(X_validation_fs1)
score_regression(y_validation, pred_xgb_fs1)
score_final(pred_validation, XGBdata(pred_xgb_fs1))

In [None]:
# Predict the validation valence with the KNN model trained on 200 features, then print the regression and class-level metrics
# np.floor turns the continuous predictions into whole valence values before they are mapped to classes
pred_knn_fs1 = knn.predict(X_validation_fs1)
score_regression(y_validation, pred_knn_fs1)
score_final(pred_validation, np.floor(pred_knn_fs1))

In [None]:
# Train the DecisionTreeRegressor on the 100 selected features (this refits the same `dc` object used earlier)
dc.fit(X_train_fs2, y_train)

In [None]:
# Train the RandomForestRegressor on the 100 selected features (this refits the same `rf` object used earlier)
rf.fit(X_train_fs2, y_train)

In [None]:
# Train the XGBRegressor on the 100 selected features (this refits the same `xgb` object used earlier)
xgb.fit(X_train_fs2, y_train)

In [None]:
# Train the KNN regressor on the 100 selected features (this refits the same `knn` object used earlier)
knn.fit(X_train_fs2, y_train)

In [None]:
# Predict the validation valence with the DecisionTree model trained on 100 features, then print the regression and class-level metrics
# NOTE(review): unlike the other models, these predictions are passed on without np.floor - confirm they are always whole valence values
pred_dc_fs2 = dc.predict(X_validation_fs2)
score_regression(y_validation,pred_dc_fs2)
score_final(pred_validation, pred_dc_fs2)

In [None]:
# Predict the validation valence with the RandomForest model trained on 100 features, then print the regression and class-level metrics
# np.floor turns the continuous predictions into whole valence values before they are mapped to classes
pred_rf_fs2 = rf.predict(X_validation_fs2)
score_regression(y_validation, pred_rf_fs2)
score_final(pred_validation, np.floor(pred_rf_fs2))

In [None]:
# Predict the validation valence with the XGBoost model trained on 100 features, then print the regression and class-level metrics
pred_xgb_fs2 = xgb.predict(X_validation_fs2)
score_regression(y_validation, pred_xgb_fs2)
# Score the 100-feature predictions made in this cell (scoring `pred_xgb` here would report the
# class metrics of the earlier full-feature model instead)
score_final(pred_validation, XGBdata(pred_xgb_fs2))

In [None]:
# Predict the validation valence with the KNN model trained on 100 features, then print the regression and class-level metrics
# np.floor turns the continuous predictions into whole valence values before they are mapped to classes
pred_knn_fs2 = knn.predict(X_validation_fs2)
score_regression(y_validation, pred_knn_fs2)
score_final(pred_validation, np.floor(pred_knn_fs2))

### 7. Hyperparameter optimization

### 7.1 Automated

In [51]:
# Trains one model with the given parameters, scores it on the validation data and returns the results as a dictionary
def hypOpt(regressor, param):
    """Fit `regressor(**param)` on the 200-feature train data and score it on the validation data.

    The raw validation predictions are converted to whole valence values by
    XGBdata and mapped to classes (1-3 negative, 4-6 neutral, 7-9 positive)
    before the micro and macro F1 scores are computed against pred_validation.

    Parameters
    ----------
    regressor : callable
        Estimator class (or factory) that accepts the entries of `param` as keyword arguments.
    param : dict
        Keyword arguments used to build the estimator.

    Returns
    -------
    dict
        Keys: 'model' (class name of the estimator), 'parameters' (its get_params()),
        'f1_score_micro', 'f1_score_macro' and 'predictions' (array of predicted class labels).
    """
    valence_to_class = {1:'negative', 2:'negative', 3:'negative', 4: 'neutral', 5:'neutral', 6:'neutral',
                        7:'positive', 8:'positive', 9:'positive'}
    model = regressor(**param)
    model.fit(X_train_fs1, y_train)
    valence_bins = XGBdata(model.predict(X_validation_fs1))
    pred = np.array([valence_to_class[valence] for valence in valence_bins])
    f1_micro = f1_score(pred_validation, pred, average = 'micro')
    f1_macro = f1_score(pred_validation, pred, average = 'macro')
    print('Model finished !')
    print("--------------------")
    return {
        'model': model.__class__.__name__,
        'parameters': model.get_params(),
        'f1_score_micro': f1_micro,
        'f1_score_macro': f1_macro,
        'predictions': pred,
    }

In [52]:
# Runs hypOpt once per parameter combination of a particular model and collects the results in a list
def listResult(classificator, parameters):
    """Return the hypOpt result dictionary of every parameter set, in the order given.

    classificator is the estimator class handed to hypOpt; parameters is an
    iterable of keyword-argument dictionaries, one per model to train.
    """
    return [hypOpt(classificator, para) for para in parameters]
    

In [53]:
# Builds every combination of the random forest search values (list of parameter dictionaries)
def paramRF():
    """Return the random forest parameter grid as a list of keyword dictionaries.

    The combinations follow itertools.product order (n_bins varies fastest),
    which gives 3 * 2 * 4 * 2 = 48 dictionaries with the keys max_depth,
    split_criterion, n_estimators and n_bins.
    """
    # NOTE(review): the notebook passes this grid to cuML's RandomForestRegressor - confirm that
    # split_criterion 0 and 1 are accepted for regression by the installed cuML version
    search_space = {
        'max_depth': [32, 64, 100],
        'split_criterion': [0, 1],
        'n_estimators': [500, 1000, 1500, 2000],
        'n_bins': [256, 512],
    }
    names = list(search_space)
    return [
        {name: value for name, value in zip(names, combo)}
        for combo in product(*search_space.values())
    ]

In [54]:
# Builds the list of parameter dictionaries for the KNN search
# 370 comes from the rule of thumb n = sqrt(number_of_entries_in_the_data)
# cuML allows only changing the n_neighbors parameter
def paramKNN():
    """Return one {'n_neighbors': k} dictionary per neighbour count to try, in ascending order."""
    neighbor_counts = (50, 100, 200, 300, 370, 400, 500)
    return [{'n_neighbors': count} for count in neighbor_counts]

In [None]:
# Builds every combination of the XGBoost search values (list of parameter dictionaries)
def paramXGB():
    """Return the XGBoost parameter grid as a list of keyword dictionaries.

    The combinations follow itertools.product order (booster varies fastest),
    which gives 1 * 1 * 4 * 4 * 2 = 32 dictionaries with the keys verbosity,
    tree_method, max_depth, n_estimators and booster.
    """
    search_space = {
        'verbosity': [0],
        'tree_method': ['gpu_hist'],
        'max_depth': [3, 6, 7, None],
        'n_estimators': [500, 1000, 1500, 2000],
        'booster': ['gbtree', 'dart'],
    }
    names = list(search_space)
    return [
        {name: value for name, value in zip(names, combo)}
        for combo in product(*search_space.values())
    ]

In [None]:
# Sweep the random-forest grid: one hypOpt run (fit + predict + score) per parameter set (NOT ENOUGH RAM)
parameters = paramRF()
print(
    "--------------------",
    f'There are {len(parameters)} combination of different parameters !',
    "--------------------",
    sep='\n',
)
results_rf = listResult(rfCM, parameters)

In [None]:
# Tabulate the random-forest sweep results and save them to the working directory.
# The preview is the cell's last expression so the notebook actually renders it
# (as a bare statement in the middle of the cell it was silently discarded).
df_rf = pd.DataFrame(results_rf)
df_rf.to_csv('./results_rf.csv')
df_rf.head(10)

In [None]:
# Sweep the XGBoost grid: one hypOpt run (fit + predict + score) per parameter set (NOT ENOUGH RAM)
# XGBRegressor is used because it is the only XGBoost estimator this notebook imports
# (XGBClassifier was never imported, so the previous call raised NameError) and because
# hypOpt post-processes the model output with XGBdata, as the manual regressor runs do.
parameters = paramXGB()
print("--------------------")
print('There are '+ str(len(parameters)) + ' combination of different parameters !')
print("--------------------")
results_xgb = listResult(XGBRegressor, parameters)

In [None]:
# Tabulate the XGBoost sweep results and save them to the working directory.
# The preview is the cell's last expression so the notebook actually renders it
# (as a bare statement in the middle of the cell it was silently discarded).
df_xgb = pd.DataFrame(results_xgb)
df_xgb.to_csv('./results_xgb.csv')
df_xgb.head(10)

In [55]:
# Sweep the KNN grid: one hypOpt run (fit + predict + score) per n_neighbors value
parameters = paramKNN()
print(
    "--------------------",
    f'There are {len(parameters)} combination of different parameters !',
    "--------------------",
    sep='\n',
)
results_knn = listResult(knnCM, parameters)

--------------------
There are 7 combination of different parameters !
--------------------
Model finished !
--------------------
Model finished !
--------------------
Model finished !
--------------------
Model finished !
--------------------
Model finished !
--------------------
Model finished !
--------------------
Model finished !
--------------------


In [56]:
# Turn the list of per-run result dicts from the KNN sweep into a table (one row per
# n_neighbors value) and preview up to the first 10 rows.
df_knn = pd.DataFrame(results_knn)
df_knn.head(10)

Unnamed: 0,model,parameters,f1_score_micro,f1_score_macro,predictions
0,KNeighborsRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.63406,0.467582,"[neutral, neutral, neutral, neutral, neutral, ..."
1,KNeighborsRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.656928,0.481361,"[neutral, neutral, neutral, neutral, neutral, ..."
2,KNeighborsRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.677179,0.491344,"[neutral, neutral, neutral, neutral, neutral, ..."
3,KNeighborsRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.687849,0.495527,"[neutral, neutral, neutral, neutral, neutral, ..."
4,KNeighborsRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.691229,0.494707,"[neutral, neutral, neutral, neutral, neutral, ..."
5,KNeighborsRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.692052,0.493457,"[neutral, neutral, neutral, neutral, neutral, ..."
6,KNeighborsRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.695462,0.491624,"[neutral, neutral, neutral, neutral, neutral, ..."


In [57]:
# Save the KNN results table to the notebook's working directory.
# NOTE(review): the 'predictions' column holds whole prediction arrays; presumably they
# are written as their (possibly truncated) string form — confirm the CSV is usable.
df_knn.to_csv('./results_knn.csv')

### 7.2 Manual

### XGBoost: manual runs

In [58]:
# Start an empty list that the XGBoost record cells below append one result dict to per run.
results = list()

In [59]:
# Manual run: XGBoost regressor, 500 boosting rounds, 'gbtree' booster, GPU histogram tree method.
# X_train_fs1 is presumably the fs1-transformed training matrix built in an earlier cell.
xgb = XGBRegressor(tree_method = 'gpu_hist', n_estimators = 500, booster = 'gbtree')
xgb.fit(X_train_fs1,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=500, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [60]:
# Predict on the validation split, print the regression metrics for the raw output,
# then the classification report / F1 / confusion matrix for the XGBdata-converted output.
pred = xgb.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Mean absolute error:  1.6636364179940988
--------------------
Mean squared error: 4.608460017689777
--------------------
Root mean squared error: 2.1467324047700442
--------------------
R2 score: -0.3742859887780554
--------------------
              precision    recall  f1-score   support

    negative       0.21      0.36      0.27      5552
     neutral       0.70      0.67      0.68     23230
    positive       0.31      0.15      0.21      5240

    accuracy                           0.54     34022
   macro avg       0.41      0.39      0.39     34022
weighted avg       0.56      0.54      0.54     34022

--------------------
The accuracy of the model is: 0.5377696784433602
--------------------
The f1-micro score for the model is: 0.5377696784433602
--------------------
The f1-macro score for the model is: 0.38597687855095814
--------------------
[[ 1986  3272   294]
 [ 6250 15500  1480]
 [ 1019  3411   810]]


In [61]:
# Append this run's validation F1 scores to `results`.
# The record is kept under its own name (`record`): hypOpt looks labels up through a
# name `dicts`, so this cell must not rebind it at notebook level.
# NOTE: `pred` is rebound here, so run this cell once per prediction.
pred = XGBdata(pred)  # convert the raw model output (helper defined in an earlier cell)
# `dict` must be the notebook-level lookup table from an earlier cell (it shadows the builtin).
pred = np.array([dict[x] for x in pred])
record = {
    'model': xgb.__class__.__name__,
    'parameters': xgb.get_params(),
    'f1_score_micro': f1_score(pred_validation, pred, average='micro'),
    'f1_score_macro': f1_score(pred_validation, pred, average='macro'),
}
results.append(record)

In [62]:
# Manual run: same as the 500-round gbtree model but with 1000 boosting rounds.
xgb = XGBRegressor(tree_method = 'gpu_hist', n_estimators = 1000, booster = 'gbtree')
xgb.fit(X_train_fs1,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=1000,
             n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=0, reg_lambda=1, ...)

In [63]:
# Predict on the validation split, print the regression metrics for the raw output,
# then the classification report / F1 / confusion matrix for the XGBdata-converted output.
pred = xgb.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Mean absolute error:  1.6854806612112974
--------------------
Mean squared error: 4.736915635873007
--------------------
Root mean squared error: 2.1764456427563283
--------------------
R2 score: -0.4125926585921367
--------------------
              precision    recall  f1-score   support

    negative       0.21      0.35      0.26      5552
     neutral       0.70      0.66      0.68     23230
    positive       0.30      0.16      0.21      5240

    accuracy                           0.53     34022
   macro avg       0.40      0.39      0.38     34022
weighted avg       0.56      0.53      0.54     34022

--------------------
The accuracy of the model is: 0.5328904826288872
--------------------
The f1-micro score for the model is: 0.5328904826288872
--------------------
The f1-macro score for the model is: 0.3834667481079214
--------------------
[[ 1970  3256   326]
 [ 6338 15336  1556]
 [ 1056  3360   824]]


In [64]:
# Append this run's validation F1 scores to `results`.
# The record is kept under its own name (`record`): hypOpt looks labels up through a
# name `dicts`, so this cell must not rebind it at notebook level.
# NOTE: `pred` is rebound here, so run this cell once per prediction.
pred = XGBdata(pred)  # convert the raw model output (helper defined in an earlier cell)
# `dict` must be the notebook-level lookup table from an earlier cell (it shadows the builtin).
pred = np.array([dict[x] for x in pred])
record = {
    'model': xgb.__class__.__name__,
    'parameters': xgb.get_params(),
    'f1_score_micro': f1_score(pred_validation, pred, average='micro'),
    'f1_score_macro': f1_score(pred_validation, pred, average='macro'),
}
results.append(record)

In [65]:
# Manual run: 500 boosting rounds with the 'dart' booster instead of 'gbtree'.
# NOTE(review): the validation metrics printed for this run match the 500-round gbtree run
# almost digit for digit; presumably no trees are dropped because no dropout rate
# (rate_drop) is set — confirm against the XGBoost DART documentation.
xgb = XGBRegressor(tree_method = 'gpu_hist', n_estimators = 500, booster = 'dart')
xgb.fit(X_train_fs1,y_train)

XGBRegressor(base_score=0.5, booster='dart', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=500, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [66]:
# Predict on the validation split, print the regression metrics for the raw output,
# then the classification report / F1 / confusion matrix for the XGBdata-converted output.
pred = xgb.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Mean absolute error:  1.6636363726119558
--------------------
Mean squared error: 4.608460009817854
--------------------
Root mean squared error: 2.146732402936578
--------------------
R2 score: -0.3742859864305741
--------------------
              precision    recall  f1-score   support

    negative       0.21      0.36      0.27      5552
     neutral       0.70      0.67      0.68     23230
    positive       0.31      0.15      0.21      5240

    accuracy                           0.54     34022
   macro avg       0.41      0.39      0.39     34022
weighted avg       0.56      0.54      0.54     34022

--------------------
The accuracy of the model is: 0.5377696784433602
--------------------
The f1-micro score for the model is: 0.5377696784433602
--------------------
The f1-macro score for the model is: 0.38597687855095814
--------------------
[[ 1986  3272   294]
 [ 6250 15500  1480]
 [ 1019  3411   810]]


In [67]:
# Append this run's validation F1 scores to `results`.
# The record is kept under its own name (`record`): hypOpt looks labels up through a
# name `dicts`, so this cell must not rebind it at notebook level.
# NOTE: `pred` is rebound here, so run this cell once per prediction.
pred = XGBdata(pred)  # convert the raw model output (helper defined in an earlier cell)
# `dict` must be the notebook-level lookup table from an earlier cell (it shadows the builtin).
pred = np.array([dict[x] for x in pred])
record = {
    'model': xgb.__class__.__name__,
    'parameters': xgb.get_params(),
    'f1_score_micro': f1_score(pred_validation, pred, average='micro'),
    'f1_score_macro': f1_score(pred_validation, pred, average='macro'),
}
results.append(record)

In [68]:
# Manual run: 'dart' booster, 500 rounds, trees limited to depth 5.
xgb = XGBRegressor(tree_method = 'gpu_hist', n_estimators = 500, booster = 'dart', max_depth=5)
xgb.fit(X_train_fs1,y_train)

XGBRegressor(base_score=0.5, booster='dart', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=500, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [69]:
# Predict on the validation split, print the regression metrics for the raw output,
# then the classification report / F1 / confusion matrix for the XGBdata-converted output.
pred = xgb.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Mean absolute error:  1.64744348987831
--------------------
Mean squared error: 4.654795750952458
--------------------
Root mean squared error: 2.1574975668474017
--------------------
R2 score: -0.38810373890678074
--------------------
              precision    recall  f1-score   support

    negative       0.19      0.29      0.23      5552
     neutral       0.69      0.70      0.70     23230
    positive       0.35      0.13      0.19      5240

    accuracy                           0.55     34022
   macro avg       0.41      0.38      0.37     34022
weighted avg       0.56      0.55      0.54     34022

--------------------
The accuracy of the model is: 0.546411145729234
--------------------
The f1-micro score for the model is: 0.546411145729234
--------------------
The f1-macro score for the model is: 0.37356705576025356
--------------------
[[ 1618  3609   325]
 [ 5975 16267   988]
 [  912  3623   705]]


In [70]:
# Append this run's validation F1 scores to `results`.
# The record is kept under its own name (`record`): hypOpt looks labels up through a
# name `dicts`, so this cell must not rebind it at notebook level.
# NOTE: `pred` is rebound here, so run this cell once per prediction.
pred = XGBdata(pred)  # convert the raw model output (helper defined in an earlier cell)
# `dict` must be the notebook-level lookup table from an earlier cell (it shadows the builtin).
pred = np.array([dict[x] for x in pred])
record = {
    'model': xgb.__class__.__name__,
    'parameters': xgb.get_params(),
    'f1_score_micro': f1_score(pred_validation, pred, average='micro'),
    'f1_score_macro': f1_score(pred_validation, pred, average='macro'),
}
results.append(record)

In [71]:
# Manual run: 'dart' booster, 500 rounds, trees limited to depth 7.
xgb = XGBRegressor(tree_method = 'gpu_hist', n_estimators = 500, booster = 'dart', max_depth=7)
xgb.fit(X_train_fs1,y_train)

XGBRegressor(base_score=0.5, booster='dart', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=500, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [72]:
# Predict on the validation split, print the regression metrics for the raw output,
# then the classification report / F1 / confusion matrix for the XGBdata-converted output.
pred = xgb.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Mean absolute error:  1.6320782261526947
--------------------
Mean squared error: 4.471499561727469
--------------------
Root mean squared error: 2.1145920556285716
--------------------
R2 score: -0.3334430965921291
--------------------
              precision    recall  f1-score   support

    negative       0.20      0.33      0.25      5552
     neutral       0.70      0.68      0.69     23230
    positive       0.33      0.15      0.21      5240

    accuracy                           0.54     34022
   macro avg       0.41      0.39      0.38     34022
weighted avg       0.56      0.54      0.54     34022

--------------------
The accuracy of the model is: 0.5395332431955794
--------------------
The f1-micro score for the model is: 0.5395332431955794
--------------------
The f1-macro score for the model is: 0.38247562169073174
--------------------
[[ 1845  3463   244]
 [ 6172 15715  1343]
 [ 1068  3376   796]]


In [73]:
# Append this run's validation F1 scores to `results`.
# The record is kept under its own name (`record`): hypOpt looks labels up through a
# name `dicts`, so this cell must not rebind it at notebook level.
# NOTE: `pred` is rebound here, so run this cell once per prediction.
pred = XGBdata(pred)  # convert the raw model output (helper defined in an earlier cell)
# `dict` must be the notebook-level lookup table from an earlier cell (it shadows the builtin).
pred = np.array([dict[x] for x in pred])
record = {
    'model': xgb.__class__.__name__,
    'parameters': xgb.get_params(),
    'f1_score_micro': f1_score(pred_validation, pred, average='micro'),
    'f1_score_macro': f1_score(pred_validation, pred, average='macro'),
}
results.append(record)

In [74]:
# Table of the manual XGBoost runs collected in `results` (one row per run); head() previews up to 5 rows.
df_xgb = pd.DataFrame(results)
df_xgb.head()

Unnamed: 0,model,parameters,f1_score_micro,f1_score_macro
0,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.53777,0.385977
1,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.53289,0.383467
2,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.53777,0.385977
3,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.546411,0.373567
4,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.539533,0.382476


In [75]:
# Save the manual XGBoost results (same path the sweep cell above writes to, so that file is replaced).
df_xgb.to_csv('./results_xgb.csv')

### Random forest: manual runs

In [27]:
# Reset `results` to an empty list; the random-forest record cells below append one dict per run.
results = list()

In [28]:
# Manual run: cuML random-forest regressor (imported as rfCM) with 500 trees, other settings default.
# X_train_fs1 is presumably the fs1-transformed training matrix built in an earlier cell.
rf = rfCM(n_estimators=500)
rf.fit(X_train_fs1,y_train)

  ret_val = func(*args, **kwargs)


RandomForestRegressor()

In [29]:
# Predict on the validation split, print the regression metrics for the raw output,
# then the classification report / F1 / confusion matrix for the XGBdata-converted output.
pred = rf.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


Mean absolute error:  1.364626134009395
--------------------
Mean squared error: 3.354523705474654
--------------------
Root mean squared error: 1.8315358870288767
--------------------
R2 score: -0.00035042286613862217
--------------------
              precision    recall  f1-score   support

    negative       0.31      0.34      0.32      5552
     neutral       0.72      0.83      0.77     23230
    positive       0.63      0.13      0.22      5240

    accuracy                           0.64     34022
   macro avg       0.55      0.43      0.44     34022
weighted avg       0.64      0.64      0.61     34022

--------------------
The accuracy of the model is: 0.6429369231673623
--------------------
The f1-micro score for the model is: 0.6429369231673623
--------------------
The f1-macro score for the model is: 0.43674543165317486
--------------------
[[ 1861  3567   124]
 [ 3623 19319   288]
 [  613  3933   694]]


In [30]:
# Append this run's validation F1 scores to `results`.
# The record is kept under its own name (`record`): hypOpt looks labels up through a
# name `dicts`, so this cell must not rebind it at notebook level.
# NOTE: `pred` is rebound here, so run this cell once per prediction.
pred = XGBdata(pred)  # convert the raw model output (helper defined in an earlier cell)
# `dict` must be the notebook-level lookup table from an earlier cell (it shadows the builtin).
pred = np.array([dict[x] for x in pred])
record = {
    'model': rf.__class__.__name__,
    'parameters': rf.get_params(),
    'f1_score_micro': f1_score(pred_validation, pred, average='micro'),
    'f1_score_macro': f1_score(pred_validation, pred, average='macro'),
}
results.append(record)

In [31]:
# Manual run: same forest with 1000 trees instead of 500.
rf = rfCM(n_estimators=1000)
rf.fit(X_train_fs1,y_train)

  ret_val = func(*args, **kwargs)


RandomForestRegressor()

In [32]:
# Predict on the validation split, print the regression metrics for the raw output,
# then the classification report / F1 / confusion matrix for the XGBdata-converted output.
pred = rf.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


Mean absolute error:  1.3630839730123498
--------------------
Mean squared error: 3.3461900500305486
--------------------
Root mean squared error: 1.8292594266616609
--------------------
R2 score: 0.002134751328315976
--------------------
              precision    recall  f1-score   support

    negative       0.31      0.34      0.32      5552
     neutral       0.72      0.83      0.77     23230
    positive       0.63      0.13      0.22      5240

    accuracy                           0.64     34022
   macro avg       0.55      0.43      0.44     34022
weighted avg       0.64      0.64      0.61     34022

--------------------
The accuracy of the model is: 0.6435835635765094
--------------------
The f1-micro score for the model is: 0.6435835635765094
--------------------
The f1-macro score for the model is: 0.4366529000391852
--------------------
[[ 1864  3563   125]
 [ 3603 19346   281]
 [  598  3956   686]]


In [33]:
# Append this run's validation F1 scores to `results`.
# The record is kept under its own name (`record`): hypOpt looks labels up through a
# name `dicts`, so this cell must not rebind it at notebook level.
# NOTE: `pred` is rebound here, so run this cell once per prediction.
pred = XGBdata(pred)  # convert the raw model output (helper defined in an earlier cell)
# `dict` must be the notebook-level lookup table from an earlier cell (it shadows the builtin).
pred = np.array([dict[x] for x in pred])
record = {
    'model': rf.__class__.__name__,
    'parameters': rf.get_params(),
    'f1_score_micro': f1_score(pred_validation, pred, average='micro'),
    'f1_score_macro': f1_score(pred_validation, pred, average='macro'),
}
results.append(record)

In [34]:
# Manual run: 500 trees, 256 histogram bins, split criterion code 2
# (presumably mean squared error per the cuML docs — confirm for the installed version).
rf = rfCM(n_estimators=500, n_bins = 256, split_criterion= 2)
rf.fit(X_train_fs1,y_train)

  ret_val = func(*args, **kwargs)


RandomForestRegressor()

In [35]:
# Predict on the validation split, print the regression metrics for the raw output,
# then the classification report / F1 / confusion matrix for the XGBdata-converted output.
pred = rf.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


Mean absolute error:  1.364870655777724
--------------------
Mean squared error: 3.3538154528248496
--------------------
Root mean squared error: 1.831342527443965
--------------------
R2 score: -0.00013921528499327707
--------------------
              precision    recall  f1-score   support

    negative       0.32      0.34      0.33      5552
     neutral       0.72      0.84      0.77     23230
    positive       0.62      0.12      0.20      5240

    accuracy                           0.65     34022
   macro avg       0.55      0.43      0.44     34022
weighted avg       0.64      0.65      0.61     34022

--------------------
The accuracy of the model is: 0.645582270295691
--------------------
The f1-micro score for the model is: 0.645582270295691
--------------------
The f1-macro score for the model is: 0.43533493907003656
--------------------
[[ 1892  3538   122]
 [ 3521 19430   279]
 [  578  4020   642]]


In [36]:
# Append this run's validation F1 scores to `results`.
# The record is kept under its own name (`record`): hypOpt looks labels up through a
# name `dicts`, so this cell must not rebind it at notebook level.
# NOTE: `pred` is rebound here, so run this cell once per prediction.
pred = XGBdata(pred)  # convert the raw model output (helper defined in an earlier cell)
# `dict` must be the notebook-level lookup table from an earlier cell (it shadows the builtin).
pred = np.array([dict[x] for x in pred])
record = {
    'model': rf.__class__.__name__,
    'parameters': rf.get_params(),
    'f1_score_micro': f1_score(pred_validation, pred, average='micro'),
    'f1_score_macro': f1_score(pred_validation, pred, average='macro'),
}
results.append(record)

In [37]:
# Manual run: 500 trees, 256 histogram bins, split criterion code 4
# (presumably Poisson deviance per the cuML docs — confirm for the installed version).
rf = rfCM(n_estimators=500, n_bins = 256, split_criterion= 4)
rf.fit(X_train_fs1,y_train)

  ret_val = func(*args, **kwargs)


RandomForestRegressor()

In [38]:
# Predict on the validation split, print the regression metrics for the raw output,
# then the classification report / F1 / confusion matrix for the XGBdata-converted output.
pred = rf.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


Mean absolute error:  1.3834006924766187
--------------------
Mean squared error: 3.407883332912597
--------------------
Root mean squared error: 1.8460453225510465
--------------------
R2 score: -0.016262764097894067
--------------------
              precision    recall  f1-score   support

    negative       0.30      0.38      0.33      5552
     neutral       0.72      0.80      0.76     23230
    positive       0.64      0.11      0.19      5240

    accuracy                           0.63     34022
   macro avg       0.55      0.43      0.43     34022
weighted avg       0.64      0.63      0.60     34022

--------------------
The accuracy of the model is: 0.6289459761330903
--------------------
The f1-micro score for the model is: 0.6289459761330903
--------------------
The f1-macro score for the model is: 0.42964272803747167
--------------------
[[ 2136  3315   101]
 [ 4331 18663   236]
 [  734  3907   599]]


In [39]:
# Append this run's validation F1 scores to `results`.
# The record is kept under its own name (`record`): hypOpt looks labels up through a
# name `dicts`, so this cell must not rebind it at notebook level.
# NOTE: `pred` is rebound here, so run this cell once per prediction.
pred = XGBdata(pred)  # convert the raw model output (helper defined in an earlier cell)
# `dict` must be the notebook-level lookup table from an earlier cell (it shadows the builtin).
pred = np.array([dict[x] for x in pred])
record = {
    'model': rf.__class__.__name__,
    'parameters': rf.get_params(),
    'f1_score_micro': f1_score(pred_validation, pred, average='micro'),
    'f1_score_macro': f1_score(pred_validation, pred, average='macro'),
}
results.append(record)

In [40]:
# Manual run: 500 trees, 256 histogram bins, split criterion code 5.
# NOTE(review): the validation metrics printed for this run are identical to the
# split_criterion= 2 run — check whether this criterion is actually being applied.
rf = rfCM(n_estimators=500, n_bins = 256, split_criterion= 5)
rf.fit(X_train_fs1,y_train)

  ret_val = func(*args, **kwargs)


RandomForestRegressor()

In [41]:
# Predict on the validation split, print the regression metrics for the raw output,
# then the classification report / F1 / confusion matrix for the XGBdata-converted output.
pred = rf.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


Mean absolute error:  1.364870655777724
--------------------
Mean squared error: 3.3538154528248496
--------------------
Root mean squared error: 1.831342527443965
--------------------
R2 score: -0.00013921528499327707
--------------------
              precision    recall  f1-score   support

    negative       0.32      0.34      0.33      5552
     neutral       0.72      0.84      0.77     23230
    positive       0.62      0.12      0.20      5240

    accuracy                           0.65     34022
   macro avg       0.55      0.43      0.44     34022
weighted avg       0.64      0.65      0.61     34022

--------------------
The accuracy of the model is: 0.645582270295691
--------------------
The f1-micro score for the model is: 0.645582270295691
--------------------
The f1-macro score for the model is: 0.43533493907003656
--------------------
[[ 1892  3538   122]
 [ 3521 19430   279]
 [  578  4020   642]]


In [42]:
# Append this run's validation F1 scores to `results`.
# The record is kept under its own name (`record`): hypOpt looks labels up through a
# name `dicts`, so this cell must not rebind it at notebook level.
# NOTE: `pred` is rebound here, so run this cell once per prediction.
pred = XGBdata(pred)  # convert the raw model output (helper defined in an earlier cell)
# `dict` must be the notebook-level lookup table from an earlier cell (it shadows the builtin).
pred = np.array([dict[x] for x in pred])
record = {
    'model': rf.__class__.__name__,
    'parameters': rf.get_params(),
    'f1_score_micro': f1_score(pred_validation, pred, average='micro'),
    'f1_score_macro': f1_score(pred_validation, pred, average='macro'),
}
results.append(record)

In [43]:
# Manual run: 500 trees, 256 histogram bins, split criterion code 6.
# NOTE(review): the validation metrics printed for this run are identical to the
# split_criterion= 2 run — check whether this criterion is actually being applied.
rf = rfCM(n_estimators=500, n_bins = 256, split_criterion= 6)
rf.fit(X_train_fs1,y_train)

  return func(**kwargs)
  ret_val = func(*args, **kwargs)


RandomForestRegressor()

In [44]:
# Predict on the validation split, print the regression metrics for the raw output,
# then the classification report / F1 / confusion matrix for the XGBdata-converted output.
pred = rf.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


Mean absolute error:  1.364870655777724
--------------------
Mean squared error: 3.3538154528248496
--------------------
Root mean squared error: 1.831342527443965
--------------------
R2 score: -0.00013921528499327707
--------------------
              precision    recall  f1-score   support

    negative       0.32      0.34      0.33      5552
     neutral       0.72      0.84      0.77     23230
    positive       0.62      0.12      0.20      5240

    accuracy                           0.65     34022
   macro avg       0.55      0.43      0.44     34022
weighted avg       0.64      0.65      0.61     34022

--------------------
The accuracy of the model is: 0.645582270295691
--------------------
The f1-micro score for the model is: 0.645582270295691
--------------------
The f1-macro score for the model is: 0.43533493907003656
--------------------
[[ 1892  3538   122]
 [ 3521 19430   279]
 [  578  4020   642]]


In [45]:
# Append this run's validation F1 scores to `results`.
# The record is kept under its own name (`record`): hypOpt looks labels up through a
# name `dicts`, so this cell must not rebind it at notebook level.
# NOTE: `pred` is rebound here, so run this cell once per prediction.
pred = XGBdata(pred)  # convert the raw model output (helper defined in an earlier cell)
# `dict` must be the notebook-level lookup table from an earlier cell (it shadows the builtin).
pred = np.array([dict[x] for x in pred])
record = {
    'model': rf.__class__.__name__,
    'parameters': rf.get_params(),
    'f1_score_micro': f1_score(pred_validation, pred, average='micro'),
    'f1_score_macro': f1_score(pred_validation, pred, average='macro'),
}
results.append(record)

In [46]:
# Table of the manual random-forest runs collected in `results` (one row per run).
# head() previews only the first 5 rows, so the sixth run is not shown here.
df_rf = pd.DataFrame(results)
df_rf.head()

Unnamed: 0,model,parameters,f1_score_micro,f1_score_macro
0,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.642937,0.436745
1,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.643584,0.436653
2,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.645582,0.435335
3,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.628946,0.429643
4,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.645582,0.435335


In [47]:
# Save the manual random-forest results (same path the sweep cell above writes to, so that file is replaced).
df_rf.to_csv('./results_rf.csv')

### 8. Final models + results

In [76]:
# Stack the random-forest, XGBoost and KNN result tables into one frame with a fresh 0..n-1 index.
# Only the KNN rows (produced by hypOpt) carry a 'predictions' entry; the manual RF/XGB
# records have no such key, so that column is empty for them.
df = pd.concat([df_rf,df_xgb,df_knn],  ignore_index=True)
df.head(16)

Unnamed: 0,model,parameters,f1_score_micro,f1_score_macro,predictions
0,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.642937,0.436745,
1,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.643584,0.436653,
2,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.645582,0.435335,
3,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.628946,0.429643,
4,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.645582,0.435335,
5,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.645582,0.435335,
6,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.53777,0.385977,
7,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.53289,0.383467,
8,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.53777,0.385977,
9,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.546411,0.373567,


In [79]:
# Display the combined table without the 'predictions' column. drop() returns a new frame
# and the result is not assigned, so `df` itself still contains the column.
df.drop('predictions', axis = 1)

Unnamed: 0,model,parameters,f1_score_micro,f1_score_macro
0,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.642937,0.436745
1,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.643584,0.436653
2,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.645582,0.435335
3,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.628946,0.429643
4,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.645582,0.435335
5,RandomForestRegressor,{'handle': <cuml.raft.common.handle.Handle obj...,0.645582,0.435335
6,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.53777,0.385977
7,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.53289,0.383467
8,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.53777,0.385977
9,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.546411,0.373567


In [80]:
# Write the combined results table to the notebook's working directory.
# NOTE(review): `df` still includes the 'predictions' column here (the preceding drop
# was display-only) — confirm that saving it is intended.
df.to_csv('./results.csv')

In [78]:
# Apply fs1 (defined in an earlier cell; presumably the fitted feature selector) to the test features.
X_test_fs1 = fs1.transform(X_test)

In [95]:
# Final-model section: refit the forest with the same settings as the earlier
# split_criterion= 6 run (500 trees, 256 bins) on the fs1-transformed training data.
rf = rfCM(n_estimators=500, n_bins = 256, split_criterion= 6)
rf.fit(X_train_fs1,y_train)

  return func(**kwargs)
  ret_val = func(*args, **kwargs)


RandomForestRegressor()

In [96]:
# Re-check the refitted forest on the validation split: regression metrics for the raw
# output, then classification metrics for the XGBdata-converted output.
pred = rf.predict(X_validation_fs1)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


Mean absolute error:  1.364870655777724
--------------------
Mean squared error: 3.3538154528248496
--------------------
Root mean squared error: 1.831342527443965
--------------------
R2 score: -0.00013921528499327707
--------------------
              precision    recall  f1-score   support

    negative       0.32      0.34      0.33      5552
     neutral       0.72      0.84      0.77     23230
    positive       0.62      0.12      0.20      5240

    accuracy                           0.65     34022
   macro avg       0.55      0.43      0.44     34022
weighted avg       0.64      0.65      0.61     34022

--------------------
The accuracy of the model is: 0.645582270295691
--------------------
The f1-micro score for the model is: 0.645582270295691
--------------------
The f1-macro score for the model is: 0.43533493907003656
--------------------
[[ 1892  3538   122]
 [ 3521 19430   279]
 [  578  4020   642]]


In [97]:
# Evaluate the same forest on the test split (fs1-transformed features): regression
# metrics for the raw output, then classification metrics for the XGBdata-converted output.
pred = rf.predict(X_test_fs1)
score_regression(y_test, pred)
score_final(pred_test, XGBdata(pred))

Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


Mean absolute error:  1.3888266179708646
--------------------
Mean squared error: 3.067463946671064
--------------------
Root mean squared error: 1.7514176962309886
--------------------
R2 score: -0.040294065624775444
--------------------
              precision    recall  f1-score   support

    negative       0.23      0.18      0.20      7117
     neutral       0.62      0.77      0.69     19330
    positive       0.66      0.20      0.31      4344

    accuracy                           0.55     30791
   macro avg       0.50      0.38      0.40     30791
weighted avg       0.54      0.55      0.52     30791

--------------------
The accuracy of the model is: 0.5530512162644928
--------------------
The f1-micro score for the model is: 0.5530512162644928
--------------------
The f1-macro score for the model is: 0.4004794159185099
--------------------
[[ 1292  5722   103]
 [ 4122 14856   352]
 [   95  3368   881]]


In [98]:
# Same forest settings, but fitted on X_train rather than X_train_fs1
# (i.e. presumably without the fs1 feature selection) for comparison.
rf = rfCM(n_estimators=500, n_bins = 256, split_criterion= 6)
rf.fit(X_train,y_train)

  return func(**kwargs)
  ret_val = func(*args, **kwargs)


RandomForestRegressor()

In [99]:
# Validation-split scores for the forest fitted on X_train: regression metrics for the
# raw output, then classification metrics for the XGBdata-converted output.
pred = rf.predict(X_validation)
score_regression(y_validation, pred)
score_final(pred_validation, XGBdata(pred))

Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


Mean absolute error:  1.3626912114346896
--------------------
Mean squared error: 3.3382174657363755
--------------------
Root mean squared error: 1.8270789434877672
--------------------
R2 score: 0.004512250720255295
--------------------
              precision    recall  f1-score   support

    negative       0.31      0.35      0.33      5552
     neutral       0.72      0.83      0.77     23230
    positive       0.63      0.13      0.22      5240

    accuracy                           0.64     34022
   macro avg       0.55      0.44      0.44     34022
weighted avg       0.64      0.64      0.61     34022

--------------------
The accuracy of the model is: 0.642260890012345
--------------------
The f1-micro score for the model is: 0.642260890012345
--------------------
The f1-macro score for the model is: 0.43829439980026724
--------------------
[[ 1923  3497   132]
 [ 3712 19237   281]
 [  606  3943   691]]


In [100]:
# Test-split scores for the forest fitted on X_train: regression metrics for the raw
# output, then classification metrics for the XGBdata-converted output.
pred = rf.predict(X_test)
score_regression(y_test, pred)
score_final(pred_test, XGBdata(pred))

Defaulting to CPU-based Prediction. 
To predict on float-64 data, set parameter predict_model = 'CPU'
  ret_val = func(*args, **kwargs)


Mean absolute error:  1.3916894665248614
--------------------
Mean squared error: 3.0774415113911515
--------------------
Root mean squared error: 1.754263808949826
--------------------
R2 score: -0.04367783852256557
--------------------
              precision    recall  f1-score   support

    negative       0.22      0.18      0.20      7117
     neutral       0.62      0.76      0.68     19330
    positive       0.68      0.22      0.33      4344

    accuracy                           0.55     30791
   macro avg       0.51      0.39      0.41     30791
weighted avg       0.54      0.55      0.52     30791

--------------------
The accuracy of the model is: 0.5479848007534669
--------------------
The f1-micro score for the model is: 0.5479848007534669
--------------------
The f1-macro score for the model is: 0.4051005786944038
--------------------
[[ 1298  5709   110]
 [ 4378 14618   334]
 [   97  3290   957]]


In [101]:
# Gradient-boosted tree regressor trained with the GPU histogram method
# on X_train_fs1 (presumably the feature-selected training matrix
# -- TODO confirm where the *_fs1 frames are built).
xgb = XGBRegressor(
    booster='gbtree',
    tree_method='gpu_hist',
    n_estimators=1000,
)
xgb.fit(X_train_fs1, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=1000,
             n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=0, reg_lambda=1, ...)

In [102]:
# XGBoost (fs1 features): predictions for the validation split.
pred = xgb.predict(X_validation_fs1)

# Continuous-target view: MAE / MSE / RMSE / R2.
score_regression(y_validation, pred)

# Class-label view: XGBdata(pred) is scored against pred_validation
# (presumably the reference labels -- TODO confirm).
pred_classes = XGBdata(pred)
score_final(pred_validation, pred_classes)

Mean absolute error:  1.6854806612112974
--------------------
Mean squared error: 4.736915635873007
--------------------
Root mean squared error: 2.1764456427563283
--------------------
R2 score: -0.4125926585921367
--------------------
              precision    recall  f1-score   support

    negative       0.21      0.35      0.26      5552
     neutral       0.70      0.66      0.68     23230
    positive       0.30      0.16      0.21      5240

    accuracy                           0.53     34022
   macro avg       0.40      0.39      0.38     34022
weighted avg       0.56      0.53      0.54     34022

--------------------
The accuracy of the model is: 0.5328904826288872
--------------------
The f1-micro score for the model is: 0.5328904826288872
--------------------
The f1-macro score for the model is: 0.3834667481079214
--------------------
[[ 1970  3256   326]
 [ 6338 15336  1556]
 [ 1056  3360   824]]


In [104]:
# XGBoost (fs1 features): predictions for the test split.
pred = xgb.predict(X_test_fs1)

# Continuous-target view: MAE / MSE / RMSE / R2.
score_regression(y_test, pred)

# Class-label view: XGBdata(pred) is scored against pred_test
# (presumably the reference labels -- TODO confirm).
pred_classes = XGBdata(pred)
score_final(pred_test, pred_classes)

Mean absolute error:  1.5728261842215472
--------------------
Mean squared error: 3.8352034530556494
--------------------
Root mean squared error: 1.9583675479990086
--------------------
R2 score: -0.30066382589672025
--------------------
              precision    recall  f1-score   support

    negative       0.29      0.28      0.28      7117
     neutral       0.63      0.67      0.65     19330
    positive       0.38      0.29      0.33      4344

    accuracy                           0.53     30791
   macro avg       0.43      0.41      0.42     30791
weighted avg       0.52      0.53      0.52     30791

--------------------
The accuracy of the model is: 0.527037121236725
--------------------
The f1-micro score for the model is: 0.527037121236725
--------------------
The f1-macro score for the model is: 0.4213581903773697
--------------------
[[ 2008  4741   368]
 [ 4675 12962  1693]
 [  333  2753  1258]]


In [105]:
# Same XGBoost configuration as before, now fitted on the full X_train.
# NOTE: this rebinds `xgb`, so the model fitted on X_train_fs1 is no longer
# reachable after this cell runs.
xgb = XGBRegressor(
    booster='gbtree',
    tree_method='gpu_hist',
    n_estimators=1000,
)
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=1000,
             n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=0, reg_lambda=1, ...)

In [108]:
# XGBoost (all features): predictions for the validation split.
pred = xgb.predict(X_validation)

# Continuous-target view: MAE / MSE / RMSE / R2.
score_regression(y_validation, pred)

# Class-label view: XGBdata(pred) is scored against pred_validation
# (presumably the reference labels -- TODO confirm).
pred_classes = XGBdata(pred)
score_final(pred_validation, pred_classes)

Mean absolute error:  1.6870061346686926
--------------------
Mean squared error: 4.79552166097774
--------------------
Root mean squared error: 2.1898679551465516
--------------------
R2 score: -0.43006952480129246
--------------------
              precision    recall  f1-score   support

    negative       0.20      0.34      0.25      5552
     neutral       0.70      0.66      0.68     23230
    positive       0.34      0.16      0.21      5240

    accuracy                           0.53     34022
   macro avg       0.41      0.39      0.38     34022
weighted avg       0.56      0.53      0.54     34022

--------------------
The accuracy of the model is: 0.5299806007877256
--------------------
The f1-micro score for the model is: 0.5299806007877256
--------------------
The f1-macro score for the model is: 0.3806298393901621
--------------------
[[ 1899  3345   308]
 [ 6599 15310  1321]
 [ 1068  3350   822]]


In [109]:
# XGBoost (all features): predictions for the test split.
pred = xgb.predict(X_test)

# Continuous-target view: MAE / MSE / RMSE / R2.
score_regression(y_test, pred)

# Class-label view: XGBdata(pred) is scored against pred_test
# (presumably the reference labels -- TODO confirm).
pred_classes = XGBdata(pred)
score_final(pred_test, pred_classes)

Mean absolute error:  1.5649901972497133
--------------------
Mean squared error: 3.8018878038062627
--------------------
Root mean squared error: 1.949843020298368
--------------------
R2 score: -0.2893652180535258
--------------------
              precision    recall  f1-score   support

    negative       0.28      0.28      0.28      7117
     neutral       0.63      0.67      0.65     19330
    positive       0.40      0.28      0.33      4344

    accuracy                           0.52     30791
   macro avg       0.43      0.41      0.42     30791
weighted avg       0.52      0.52      0.52     30791

--------------------
The accuracy of the model is: 0.5237894189860673
--------------------
The f1-micro score for the model is: 0.5237894189860673
--------------------
The f1-macro score for the model is: 0.41973850236667315
--------------------
[[ 2027  4716   374]
 [ 4957 12872  1501]
 [  377  2738  1229]]


In [91]:
# cuML k-nearest-neighbours regressor fitted on X_train_fs1.
# NOTE(review): the choice of 370 neighbours is not explained in this part
# of the notebook -- presumably the result of an earlier search; confirm.
knn = knnCM(
    n_neighbors=370,
)
knn.fit(X_train_fs1, y_train)

KNeighborsRegressor()

In [83]:
# KNN (fs1 features): predictions for the validation split.
# NOTE(review): this cell's execution count precedes the fit cell's, so the
# stored output may come from an earlier `knn` -- re-run top to bottom to confirm.
pred = knn.predict(X_validation_fs1)

# Continuous-target view: MAE / MSE / RMSE / R2.
score_regression(y_validation, pred)

# Class-label view: XGBdata(pred) is scored against pred_validation
# (presumably the reference labels -- TODO confirm).
pred_classes = XGBdata(pred)
score_final(pred_validation, pred_classes)

Mean absolute error:  1.2941695135545566
--------------------
Mean squared error: 3.042316425444271
--------------------
Root mean squared error: 1.74422373147606
--------------------
R2 score: 0.09275271546928976
--------------------
              precision    recall  f1-score   support

    negative       0.44      0.45      0.45      5552
     neutral       0.74      0.87      0.80     23230
    positive       0.64      0.14      0.23      5240

    accuracy                           0.69     34022
   macro avg       0.61      0.49      0.49     34022
weighted avg       0.68      0.69      0.66     34022

--------------------
The accuracy of the model is: 0.6912292046322968
--------------------
The f1-micro score for the model is: 0.6912292046322968
--------------------
The f1-macro score for the model is: 0.4947065612913914
--------------------
[[ 2504  2889   159]
 [ 2707 20266   257]
 [  433  4060   747]]


In [92]:
# KNN (fs1 features): predictions for the test split.
pred = knn.predict(X_test_fs1)

# Continuous-target view: MAE / MSE / RMSE / R2.
score_regression(y_test, pred)

# Class-label view: XGBdata(pred) is scored against pred_test
# (presumably the reference labels -- TODO confirm).
pred_classes = XGBdata(pred)
score_final(pred_test, pred_classes)

Mean absolute error:  1.369739226950198
--------------------
Mean squared error: 2.974805033840439
--------------------
Root mean squared error: 1.7247623122739084
--------------------
R2 score: -0.008869892815979608
--------------------
              precision    recall  f1-score   support

    negative       0.18      0.18      0.18      7117
     neutral       0.60      0.69      0.64     19330
    positive       0.81      0.26      0.39      4344

    accuracy                           0.51     30791
   macro avg       0.53      0.38      0.40     30791
weighted avg       0.53      0.51      0.50     30791

--------------------
The accuracy of the model is: 0.5121626449287129
--------------------
The f1-micro score for the model is: 0.5121626449287129
--------------------
The f1-macro score for the model is: 0.40433502582566533
--------------------
[[ 1278  5714   125]
 [ 5825 13376   129]
 [  104  3124  1116]]


In [93]:
# Same KNN configuration, now fitted on the full X_train.
# NOTE: this rebinds `knn`, so the model fitted on X_train_fs1 is no longer
# reachable after this cell runs.
knn = knnCM(
    n_neighbors=370,
)
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [88]:
# KNN (all features): predictions for the validation split.
# NOTE(review): this cell's execution count precedes the fit cell's, so the
# stored output may come from an earlier `knn` -- re-run top to bottom to confirm.
pred = knn.predict(X_validation)

# Continuous-target view: MAE / MSE / RMSE / R2.
score_regression(y_validation, pred)

# Class-label view: XGBdata(pred) is scored against pred_validation
# (presumably the reference labels -- TODO confirm).
pred_classes = XGBdata(pred)
score_final(pred_validation, pred_classes)

Mean absolute error:  1.2917310278305378
--------------------
Mean squared error: 3.0239042960003726
--------------------
Root mean squared error: 1.7389376918108286
--------------------
R2 score: 0.09824338511190112
--------------------
              precision    recall  f1-score   support

    negative       0.45      0.44      0.44      5552
     neutral       0.74      0.88      0.80     23230
    positive       0.67      0.14      0.23      5240

    accuracy                           0.69     34022
   macro avg       0.62      0.49      0.49     34022
weighted avg       0.68      0.69      0.66     34022

--------------------
The accuracy of the model is: 0.6926106636882018
--------------------
The f1-micro score for the model is: 0.6926106636882018
--------------------
The f1-macro score for the model is: 0.4923785253836588
--------------------
[[ 2440  2960   152]
 [ 2622 20399   209]
 [  391  4124   725]]


In [94]:
# KNN (all features): predictions for the test split.
pred = knn.predict(X_test)

# Continuous-target view: MAE / MSE / RMSE / R2.
score_regression(y_test, pred)

# Class-label view: XGBdata(pred) is scored against pred_test
# (presumably the reference labels -- TODO confirm).
pred_classes = XGBdata(pred)
score_final(pred_test, pred_classes)

Mean absolute error:  1.3685857672956314
--------------------
Mean squared error: 2.9788295904039406
--------------------
Root mean squared error: 1.7259286168332515
--------------------
R2 score: -0.010234773506533568
--------------------
              precision    recall  f1-score   support

    negative       0.16      0.16      0.16      7117
     neutral       0.59      0.69      0.64     19330
    positive       0.82      0.23      0.35      4344

    accuracy                           0.50     30791
   macro avg       0.53      0.36      0.38     30791
weighted avg       0.53      0.50      0.49     30791

--------------------
The accuracy of the model is: 0.5027443084018057
--------------------
The f1-micro score for the model is: 0.5027443084018057
--------------------
The f1-macro score for the model is: 0.3847142077970487
--------------------
[[ 1157  5866    94]
 [ 5864 13345   121]
 [   87  3279   978]]
