In [239]:
# libraries for numerical
import pandas as pd  
import numpy as np

# libraries for visualization
import matplotlib.pyplot as plt  
import seaborn as sns

# libraries for machine learning
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics

# to plot the diagrams within the cells
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [161]:
train_data = pd.read_csv("https://raw.githubusercontent.com/dphi-official/Datasets/master/hippocorpus/train_set_label.csv" )
train_data.head(2).T

Unnamed: 0,0,1
AssignmentId,39DD6S19JQXBBJGPFEBWSF4G1TQEZJ,3WQQ9FUS6BGYKK3IWX9TEN1R8RLB87
WorkTimeInSeconds,1025,1987
WorkerId,YGROBIBW,KK89LEHY
annotatorAge,35,35
annotatorGender,Man,woman
annotatorRace,white,indian
distracted,one,one
draining,3.0,5.0
frequency,2,3
importance,3,5


In [162]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6797 entries, 0 to 6796
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   AssignmentId       6797 non-null   object 
 1   WorkTimeInSeconds  6797 non-null   int64  
 2   WorkerId           6797 non-null   object 
 3   annotatorAge       6771 non-null   float64
 4   annotatorGender    6793 non-null   object 
 5   annotatorRace      6797 non-null   object 
 6   distracted         6797 non-null   object 
 7   draining           6797 non-null   object 
 8   frequency          4052 non-null   float64
 9   importance         6649 non-null   float64
 10  logTimeSinceEvent  6797 non-null   float64
 11  mainEvent          6797 non-null   object 
 12  mostSurprising     6797 non-null   object 
 13  openness           6797 non-null   float64
 14  recAgnPairId       2549 non-null   object 
 15  recImgPairId       5303 non-null   object 
 16  similarity         2745 

In [163]:
train_data.describe()

Unnamed: 0,WorkTimeInSeconds,annotatorAge,frequency,importance,logTimeSinceEvent,openness,similarity,stressful,timeSinceEvent
count,6797.0,6771.0,4052.0,6649.0,6797.0,6797.0,2745.0,6797.0,6797.0
mean,2072.577166,33.513661,3.643633,3.958941,7.145193,0.554252,2.929326,2.161395,3.9381600000000005e+56
std,1670.981846,10.152089,1.029724,1.261159,13.854228,0.423053,1.446355,1.322928,8.718221e+57
min,173.0,18.0,1.0,1.0,1.94591,-1.0,1.0,1.0,7.0
25%,919.0,25.0,3.0,3.0,3.555348,0.25,2.0,1.0,35.0
50%,1524.0,30.0,4.0,4.0,4.49981,0.625,3.0,2.0,90.0
75%,2632.0,40.0,4.0,5.0,5.010635,0.875,4.0,3.0,150.0
max,10680.0,55.0,5.0,5.0,136.818082,1.0,5.0,5.0,2.626263e+59


In [164]:
train_data.isnull().sum()

AssignmentId            0
WorkTimeInSeconds       0
WorkerId                0
annotatorAge           26
annotatorGender         4
annotatorRace           0
distracted              0
draining                0
frequency            2745
importance            148
logTimeSinceEvent       0
mainEvent               0
mostSurprising          0
openness                0
recAgnPairId         4248
recImgPairId         1494
similarity           4052
similarityReason     4052
story                   0
stressful               0
summary                 0
timeSinceEvent          0
memType                 0
dtype: int64

# Filling the missing values

In [264]:
# Work on the annotatorAge column
test_data['annotatorAge'].isnull().value_counts()

False    2051
True        6
Name: annotatorAge, dtype: int64

In [265]:
test_data['annotatorAge'].value_counts()

25.0    484
30.0    448
35.0    341
40.0    197
18.0    179
55.0    162
45.0    138
50.0    102
Name: annotatorAge, dtype: int64

In [266]:
# Fill the missing values
test_data['annotatorAge'].fillna(test_data['annotatorAge'].median(), inplace=True)
test_data['annotatorAge'].isnull().value_counts()

False    2057
Name: annotatorAge, dtype: int64

In [267]:
test_data['annotatorAge'].value_counts()

25.0    484
30.0    454
35.0    341
40.0    197
18.0    179
55.0    162
45.0    138
50.0    102
Name: annotatorAge, dtype: int64

In [268]:
test_data['annotatorAge'] = test_data['annotatorAge'].astype(int)
test_data['annotatorAge'].value_counts()

25    484
30    454
35    341
40    197
18    179
55    162
45    138
50    102
Name: annotatorAge, dtype: int64

In [269]:
bins= [18, 24, 29, 34, 39, 44, 49, 54, 55]
labels = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55+']
test_data['AgeGroup'] = pd.cut(test_data['annotatorAge'], bins=bins, labels=labels, right=False)
test_data.AgeGroup.value_counts()

25-29    484
30-34    454
35-39    341
40-44    197
18-24    179
45-49    138
50-54    102
55+        0
Name: AgeGroup, dtype: int64

In [270]:
# Work on the annotatorGender columns

In [271]:
test_data['annotatorGender'].value_counts()

woman         605
man           592
Man           303
Woman         277
MAN           133
WOMAN         127
na              4
nonBinary       4
Nonbinary       4
Na              3
other           1
NONBINARY       1
OTHER           1
transwoman      1
transman        1
Name: annotatorGender, dtype: int64

In [272]:
test_data['annotatorGender'].replace(to_replace =['man', 'Man', 'MAN'], value ="MALE", inplace=True) 
test_data['annotatorGender'].replace(to_replace =['woman', 'Woman', 'WOMAN'], value ="WOMAN", inplace=True) 
test_data['annotatorGender'].replace(to_replace =['transman', 'Transman', 'TRANSMAN', 'Transwoman', 'transwoman'], 
                 value ="TRANSGENDER", inplace=True) 
test_data['annotatorGender'].replace(to_replace =['nonBinary', 'na', 'Na', 'Nonbinary', 'NONBINARY', 'other', 'OTHER'], 
                 value =np.nan, inplace=True) 
test_data['annotatorGender'].replace(to_replace = np.nan, 
                 value = 'TRANSGENDER', inplace=True)
test_data.annotatorGender.value_counts()

MALE           1028
WOMAN          1009
TRANSGENDER      20
Name: annotatorGender, dtype: int64

In [273]:
test_data['annotatorGender'].isnull().value_counts()

False    2057
Name: annotatorGender, dtype: int64

In [175]:
test_data['annotatorAge'].fillna(test_data['annotatorAge'].mean(), inplace=True)
test_data['frequency'].fillna(test_data['frequency'].mean(), inplace=True)
test_data['importance'].fillna(test_data['importance'].mean(), inplace=True)

NameError: name 'test_data' is not defined

In [274]:
# Work on the annotatorRace columns
test_data['annotatorRace'].value_counts()

white            1078
White             422
black             148
asian              90
hisp               80
Black              70
Hisp               41
other              35
Asian              30
Other              15
native             15
na                 13
indian              6
Indian              5
islander            3
Na                  3
Middleeastern       1
middleEastern       1
Native              1
Name: annotatorRace, dtype: int64

In [275]:
test_data['annotatorRace'].replace(to_replace =['white', 'White'], 
                 value ="WHITE", inplace=True)
test_data['annotatorRace'].replace(to_replace =['black', 'Black'], 
                 value ="BLACK", inplace=True)
test_data['annotatorRace'].replace(to_replace =['asian', 'Asian', 'middleEastern', 'Middleeastern', 'indian', 'Indian'],
                 value ="ASIAN", inplace=True)
test_data['annotatorRace'].replace(to_replace =['Na', 'na', 'Other', 'other'], 
                 value ="OTHERS", inplace=True)
test_data['annotatorRace'].replace(to_replace =['hisp', 'Hisp'], 
                 value ="HISPANIC", inplace=True)
test_data['annotatorRace'].replace(to_replace =['native', 'Native'], 
                 value ="NATIVE AMERICANS", inplace=True)
test_data['annotatorRace'].replace(to_replace =['islander', 'Islander'], 
                 value ="ISLANDER", inplace=True)
test_data['annotatorRace'].value_counts()

WHITE               1500
BLACK                218
ASIAN                133
HISPANIC             121
OTHERS                66
NATIVE AMERICANS      16
ISLANDER               3
Name: annotatorRace, dtype: int64

In [276]:
test_data['annotatorRace'].isnull().sum()

0

In [277]:
# Work on the distracted column
test_data['distracted'].value_counts()

one    1539
2.0     294
3.0     107
4.0      65
5.0      52
Name: distracted, dtype: int64

In [278]:
test_data['distracted'].replace(to_replace =['one'], 
                 value =1.0, inplace=True)

In [279]:
test_data['distracted'] = test_data['distracted'].astype(float)

In [280]:
# Work on the draining columns
test_data['draining'].value_counts()

one    898
2.0    389
3.0    339
4.0    272
5.0    159
Name: draining, dtype: int64

In [281]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2057 entries, 0 to 2056
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   AssignmentId       2057 non-null   object  
 1   WorkTimeInSeconds  2057 non-null   int64   
 2   WorkerId           2057 non-null   object  
 3   annotatorAge       2057 non-null   int32   
 4   annotatorGender    2057 non-null   object  
 5   annotatorRace      2057 non-null   object  
 6   distracted         2057 non-null   float64 
 7   draining           2057 non-null   object  
 8   frequency          1230 non-null   float64 
 9   importance         2018 non-null   float64 
 10  logTimeSinceEvent  2057 non-null   float64 
 11  mainEvent          2057 non-null   object  
 12  mostSurprising     2057 non-null   object  
 13  openness           2057 non-null   float64 
 14  recAgnPairId       811 non-null    object  
 15  recImgPairId       1595 non-null   object  
 16  simila

In [282]:
test_data['draining'].replace(to_replace =['one'], 
                 value =1.0, inplace=True) 
test_data['draining'] = test_data['draining'].astype(float)

In [283]:
# Work on the frequency column
test_data.frequency.value_counts()

4.0    441
3.0    318
5.0    290
2.0    164
1.0     17
Name: frequency, dtype: int64

In [284]:
test_data.frequency.isnull().sum()

827

In [285]:
test_data['frequency'].fillna(test_data['frequency'].median(), inplace=True)

In [286]:
# Work on the importance columns
test_data.importance.value_counts()

5.0    924
4.0    501
3.0    303
1.0    157
2.0    133
Name: importance, dtype: int64

In [287]:
test_data.importance.isnull().sum()

39

In [288]:
test_data['importance'].fillna(test_data['importance'].median(), inplace=True)

In [290]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2057 entries, 0 to 2056
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   AssignmentId       2057 non-null   object  
 1   WorkTimeInSeconds  2057 non-null   int64   
 2   WorkerId           2057 non-null   object  
 3   annotatorAge       2057 non-null   int32   
 4   annotatorGender    2057 non-null   object  
 5   annotatorRace      2057 non-null   object  
 6   distracted         2057 non-null   float64 
 7   draining           2057 non-null   float64 
 8   frequency          2057 non-null   float64 
 9   importance         2057 non-null   float64 
 10  logTimeSinceEvent  2057 non-null   float64 
 11  mainEvent          2057 non-null   object  
 12  mostSurprising     2057 non-null   object  
 13  openness           2057 non-null   float64 
 14  recAgnPairId       811 non-null    object  
 15  recImgPairId       1595 non-null   object  
 16  simila

In [291]:
# Work on the similarity column
test_data['similarity'].value_counts()

1.0    206
4.0    185
3.0    150
2.0    143
5.0    143
Name: similarity, dtype: int64

In [292]:
test_data['similarity'].isnull().sum()

1230

In [293]:
test_data['similarity'].fillna(test_data['similarity'].median(), inplace=True)

In [294]:
test_data['similarity'].isnull().sum()

0

In [295]:
# Work on the stressfull column
test_data['stressful'].value_counts()

1.0    927
2.0    398
3.0    292
4.0    273
5.0    167
Name: stressful, dtype: int64

In [296]:
test_data['stressful'].isnull().sum()

0

In [297]:
test_data.timeSinceEvent.value_counts()

9.000000e+01    338
6.000000e+01    264
1.200000e+02    246
1.500000e+02    160
1.800000e+02    138
3.000000e+01    135
7.000000e+00    133
1.400000e+01    110
2.100000e+01     84
2.100000e+02     48
2.800000e+01     44
2.400000e+02     37
4.200000e+01     33
2.700000e+02     25
3.500000e+01     19
2.222222e+29     18
4.444444e+29     17
3.600000e+02     16
3.333333e+29     13
5.600000e+01     13
4.900000e+01     12
3.000000e+02     12
5.555556e+29     11
1.111111e+06     10
7.800000e+02      9
2.222222e+06      8
3.300000e+02      8
1.111111e+29      8
7.200000e+02      7
4.500000e+02      6
7.000000e+01      6
6.666667e+29      6
8.400000e+01      6
1.820000e+02      5
6.300000e+01      5
3.333333e+06      5
4.444444e+06      4
9.800000e+01      3
4.200000e+02      3
1.260000e+02      3
6.000000e+02      3
3.900000e+02      3
1.680000e+02      3
1.120000e+02      2
5.555555e+06      2
5.400000e+02      2
6.600000e+02      2
7.700000e+01      2
7.500000e+02      2
6.900000e+02      1


In [298]:
test_data.timeSinceEvent.isnull().sum()

0

In [299]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2057 entries, 0 to 2056
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   AssignmentId       2057 non-null   object  
 1   WorkTimeInSeconds  2057 non-null   int64   
 2   WorkerId           2057 non-null   object  
 3   annotatorAge       2057 non-null   int32   
 4   annotatorGender    2057 non-null   object  
 5   annotatorRace      2057 non-null   object  
 6   distracted         2057 non-null   float64 
 7   draining           2057 non-null   float64 
 8   frequency          2057 non-null   float64 
 9   importance         2057 non-null   float64 
 10  logTimeSinceEvent  2057 non-null   float64 
 11  mainEvent          2057 non-null   object  
 12  mostSurprising     2057 non-null   object  
 13  openness           2057 non-null   float64 
 14  recAgnPairId       811 non-null    object  
 15  recImgPairId       1595 non-null   object  
 16  simila

In [300]:
test_data.notnull().sum()

AssignmentId         2057
WorkTimeInSeconds    2057
WorkerId             2057
annotatorAge         2057
annotatorGender      2057
annotatorRace        2057
distracted           2057
draining             2057
frequency            2057
importance           2057
logTimeSinceEvent    2057
mainEvent            2057
mostSurprising       2057
openness             2057
recAgnPairId          811
recImgPairId         1595
similarity           2057
similarityReason      827
story                2057
stressful            2057
summary              2057
timeSinceEvent       2057
AgeGroup             1895
dtype: int64

In [301]:
# Workon agegroup column
test_data.AgeGroup.value_counts()

25-29    484
30-34    454
35-39    341
40-44    197
18-24    179
45-49    138
50-54    102
55+        0
Name: AgeGroup, dtype: int64

In [302]:
test_data.AgeGroup.isnull().sum()

162

In [303]:
test_data['AgeGroup'].fillna(test_data['AgeGroup'].value_counts().index[0], inplace=True)

In [305]:
test_data.isnull().sum()

AssignmentId            0
WorkTimeInSeconds       0
WorkerId                0
annotatorAge            0
annotatorGender         0
annotatorRace           0
distracted              0
draining                0
frequency               0
importance              0
logTimeSinceEvent       0
mainEvent               0
mostSurprising          0
openness                0
recAgnPairId         1246
recImgPairId          462
similarity              0
similarityReason     1230
story                   0
stressful               0
summary                 0
timeSinceEvent          0
AgeGroup                0
dtype: int64

In [306]:
test_data.isna().sum()

AssignmentId            0
WorkTimeInSeconds       0
WorkerId                0
annotatorAge            0
annotatorGender         0
annotatorRace           0
distracted              0
draining                0
frequency               0
importance              0
logTimeSinceEvent       0
mainEvent               0
mostSurprising          0
openness                0
recAgnPairId         1246
recImgPairId          462
similarity              0
similarityReason     1230
story                   0
stressful               0
summary                 0
timeSinceEvent          0
AgeGroup                0
dtype: int64

In [307]:
test_data.shape

(2057, 23)

In [308]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2057 entries, 0 to 2056
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   AssignmentId       2057 non-null   object  
 1   WorkTimeInSeconds  2057 non-null   int64   
 2   WorkerId           2057 non-null   object  
 3   annotatorAge       2057 non-null   int32   
 4   annotatorGender    2057 non-null   object  
 5   annotatorRace      2057 non-null   object  
 6   distracted         2057 non-null   float64 
 7   draining           2057 non-null   float64 
 8   frequency          2057 non-null   float64 
 9   importance         2057 non-null   float64 
 10  logTimeSinceEvent  2057 non-null   float64 
 11  mainEvent          2057 non-null   object  
 12  mostSurprising     2057 non-null   object  
 13  openness           2057 non-null   float64 
 14  recAgnPairId       811 non-null    object  
 15  recImgPairId       1595 non-null   object  
 16  simila

In [309]:
test_data.memType.value_counts()

AttributeError: 'DataFrame' object has no attribute 'memType'

In [211]:
memtype_map  = {'recalled':1,
              'imagined':2,
              'retold':3 }
train_data['memType'] = train_data.memType.map(memtype_map)
train_data.memType.value_counts()

1    2760
2    2745
3    1292
Name: memType, dtype: int64

In [310]:
pd.isnull(test_data).sum()

AssignmentId            0
WorkTimeInSeconds       0
WorkerId                0
annotatorAge            0
annotatorGender         0
annotatorRace           0
distracted              0
draining                0
frequency               0
importance              0
logTimeSinceEvent       0
mainEvent               0
mostSurprising          0
openness                0
recAgnPairId         1246
recImgPairId          462
similarity              0
similarityReason     1230
story                   0
stressful               0
summary                 0
timeSinceEvent          0
AgeGroup                0
dtype: int64

In [311]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2057 entries, 0 to 2056
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   AssignmentId       2057 non-null   object  
 1   WorkTimeInSeconds  2057 non-null   int64   
 2   WorkerId           2057 non-null   object  
 3   annotatorAge       2057 non-null   int32   
 4   annotatorGender    2057 non-null   object  
 5   annotatorRace      2057 non-null   object  
 6   distracted         2057 non-null   float64 
 7   draining           2057 non-null   float64 
 8   frequency          2057 non-null   float64 
 9   importance         2057 non-null   float64 
 10  logTimeSinceEvent  2057 non-null   float64 
 11  mainEvent          2057 non-null   object  
 12  mostSurprising     2057 non-null   object  
 13  openness           2057 non-null   float64 
 14  recAgnPairId       811 non-null    object  
 15  recImgPairId       1595 non-null   object  
 16  simila

In [312]:
numeric_data = test_data[['WorkTimeInSeconds', 'annotatorAge', 'distracted', 'draining', 'frequency', 'importance', 'logTimeSinceEvent', 'openness', 'stressful', 'timeSinceEvent']]

In [314]:
categorical_data = test_data.drop(numeric_data.columns, axis=1)
categorical_data.nunique()

AssignmentId        2057
WorkerId            1514
annotatorGender        3
annotatorRace          7
mainEvent           2004
mostSurprising      2037
recAgnPairId         679
recImgPairId        1341
similarity             5
similarityReason     769
story               2057
summary             1582
AgeGroup               7
dtype: int64

In [315]:
test_data = pd.get_dummies(test_data, columns= ['annotatorGender', 'annotatorRace', 'similarity', 'AgeGroup'], drop_first=True)

In [316]:
# Delete the unwanted columns from the dataframe
test_data.drop(['AssignmentId', 'WorkerId', 'annotatorAge', 'mainEvent', 'mostSurprising', 'recAgnPairId', 'recImgPairId',
                 'similarityReason', 'story', 'summary'], axis=1, inplace=True)

In [317]:
count = np.isinf(test_data).values.sum() 
print("It contains " + str(count) + " infinite values") 

It contains 0 infinite values


In [226]:
# counting infinity in a particular column name 
c = np.isinf(train_data['memType']).values.sum() 
print("It contains " + str(c) + " infinite values") 
  
# printing column name where infinity is present 
print() 
print("printing column name where infinity is present") 
col_name = train_data.columns.to_series()[np.isinf(train_data).any()] 
print(col_name) 
  
# printing row index with infinity 
print() 
print("printing row index with infinity ") 
  
r = train_data.index[np.isinf(train_data).any(1)] 
print(r) 

It contains 0 infinite values

printing column name where infinity is present
Series([], dtype: object)

printing row index with infinity 
Int64Index([], dtype='int64')


In [318]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2057 entries, 0 to 2056
Data columns (total 28 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   WorkTimeInSeconds               2057 non-null   int64  
 1   distracted                      2057 non-null   float64
 2   draining                        2057 non-null   float64
 3   frequency                       2057 non-null   float64
 4   importance                      2057 non-null   float64
 5   logTimeSinceEvent               2057 non-null   float64
 6   openness                        2057 non-null   float64
 7   stressful                       2057 non-null   float64
 8   timeSinceEvent                  2057 non-null   float64
 9   annotatorGender_TRANSGENDER     2057 non-null   uint8  
 10  annotatorGender_WOMAN           2057 non-null   uint8  
 11  annotatorRace_BLACK             2057 non-null   uint8  
 12  annotatorRace_HISPANIC          20

In [228]:
X = train_data.drop('memType', axis=1)
y = train_data['memType']

In [343]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5437, 28), (5437,), (1360, 28), (1360,))

In [344]:
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [345]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [346]:
lr.score(X_test, y_test)

0.40441176470588236

In [347]:
# Model evaluation for training set
y_pred = lr.predict(X_test)
rmse = (np.sqrt(mean_squared_error(y_test, y_pred)))
r2 = r2_score(y_test, y_pred)

# Examine the first 10 predicted output from the model
output = pd.DataFrame(y_train[0:10])
output['Predicted'] = y_pred[0:10]
output['Difference'] = output['Predicted'] - output['memType']
print(output, "\n")

print("Model training performance:")
print("---------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

      memType  Predicted  Difference
5476        1          1           0
3697        3          1          -2
1536        2          1          -1
1399        2          1          -1
1060        1          1           0
870         2          1          -1
4516        3          1          -2
582         2          1          -1
6373        3          1          -2
2971        1          1           0 

Model training performance:
---------------------------
RMSE is 1.0596197986292342
R2 score is -1.127091657542258




In [348]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accu = accuracy_score(y_test, y_pred)
print("Test set Accuracy: {:.3f}%".format(accu*100))

[[550   0   0]
 [571   0   0]
 [239   0   0]]
Test set Accuracy: 40.441%


In [262]:
test_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/hippocorpus/test_set_label.csv')
test_data.head()

Unnamed: 0,AssignmentId,WorkTimeInSeconds,WorkerId,annotatorAge,annotatorGender,annotatorRace,distracted,draining,frequency,importance,...,mostSurprising,openness,recAgnPairId,recImgPairId,similarity,similarityReason,story,stressful,summary,timeSinceEvent
0,386PBUZZXGJZHALDEEVEHG6BE9EJLD,4679,K10CDHDP,18.0,Man,hisp,one,2.0,,3.0,...,the doctor saw abnormal activity but the diagn...,-0.5,,3VFJCI1K40L0T6QPBS776FNTU1ORGT,4.0,"I've had headaches in the past, have had to wa...",12 weeks ago I had to visit the hospital for m...,3.0,Went to the hospital to get a diagnoses. This ...,84.0
1,3C8HJ7UOP8G9N782WCZF5N9PDLQMZ8,1392,KNOE8VGT,18.0,woman,asian,one,one,3.0,1.0,...,I lied about my failures,0.125,3570Y55XZQ59Q72BPNZ1EBGQYSVYG3,,,,"Well, four months ago, I felt extremely embarr...",1.0,"I talked with my aunt about my future, and I k...",120.0
2,3WR9XG3T64XAZ7DQ7D70S9ZQCVZ47T,1321,NZXIA1OJ,55.0,woman,White,one,one,5.0,5.0,...,She dedicated a song to me.,1.0,,3WR9XG3T64XAZ7DQ7D70S9ZQCVZ47T,,,The event that just happened was my daughter's...,1.0,Her wedding dress was beautiful with lots of p...,2.222222e+29
3,3VD82FOHKRAKFHI4HEUKMTX9CYJOCT,1583,18G32XSD,35.0,woman,white,5.0,3.0,2.0,5.0,...,the students reacted compassionately.,1.0,3PMBY0YE28PH8UEYFPB2P32KZY4C9Q,,,,When I was subbing at one of my regular school...,3.0,I found out my friend died from brain cancer w...,210.0
4,37TD41K0AIVZD9AXOVBT6Y7AR0OSC4,920,2DTJ9KGG,25.0,man,white,one,one,,2.0,...,I had to raise my brother as if I were his fat...,1.0,,3E1QT0TDFQV87SHO271A0A54SX6I89,1.0,I don't identify with the story since nothing ...,Growing up I wasn’t much like all of my friend...,1.0,Basically growing up was never a best experien...,300.0


In [263]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2057 entries, 0 to 2056
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   AssignmentId       2057 non-null   object 
 1   WorkTimeInSeconds  2057 non-null   int64  
 2   WorkerId           2057 non-null   object 
 3   annotatorAge       2051 non-null   float64
 4   annotatorGender    2057 non-null   object 
 5   annotatorRace      2057 non-null   object 
 6   distracted         2057 non-null   object 
 7   draining           2057 non-null   object 
 8   frequency          1230 non-null   float64
 9   importance         2018 non-null   float64
 10  logTimeSinceEvent  2057 non-null   float64
 11  mainEvent          2057 non-null   object 
 12  mostSurprising     2057 non-null   object 
 13  openness           2057 non-null   float64
 14  recAgnPairId       811 non-null    object 
 15  recImgPairId       1595 non-null   object 
 16  similarity         827 n

In [323]:
from boruta import BorutaPy

In [None]:
test_data.isnull().sum()

In [None]:
test_data.round(decimals=2)
test_data.describe()

In [None]:
test_rl_pred = lr.predict(test_data)
test_rl_pred

In [None]:
target = lr.predict(test_data)

res = pd.DataFrame(target) #target is nothing but the final predictions of your model on input features of your new unseen test data
res.index = test_data.index # its important for comparison. Here "test_new" is your new test dataset
res.columns = ['prediction']
res.to_csv("submission_sprint_20.csv") # the csv file will be saved locally on the same location where this notebook is located.

In [None]:
pred_data = pd.read_csv('submission_sprint_20.csv')
pred_data.head()

In [None]:
memtype_map  = {1:'recalled',
              2:'imagined',
              3:'retold' }
res['prediction'] = res.prediction.map(memtype_map)
res.head()

In [None]:
res.columns = ['prediction']
res.to_csv("submission_sprint_20.csv")

In [338]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

In [349]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred_log_reg = clf.predict(X_test)
acc_log_reg = round( clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_log_reg) + '%')

Train Accuracy: 40.65%


In [350]:
clf = SVC()
clf.fit(X_train, y_train)
y_pred_svc = clf.predict(X_test)
acc_svc = round(clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_svc) + '%')

Train Accuracy: 40.65%


In [351]:
clf = LinearSVC()
clf.fit(X_train, y_train)
y_pred_linear_svc = clf.predict(X_test)
acc_linear_svc = round(clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_linear_svc) + '%')

Train Accuracy: 74.88%


In [352]:
kn_clf = KNeighborsClassifier(n_neighbors = 2)
kn_clf.fit(X_train, y_train)
y_pred_knn = kn_clf.predict(X_test)
acc_knn = round(clf.score(X_train, y_train) * 100, 2)
print ("Train Accuracy: " + str(acc_knn) + '%')

Train Accuracy: 74.88%


In [360]:
!pip install catboost

Collecting catboost
  Downloading catboost-0.24.4-cp38-none-win_amd64.whl (65.4 MB)
Collecting graphviz
  Downloading graphviz-0.16-py2.py3-none-any.whl (19 kB)
Collecting plotly
  Downloading plotly-4.14.1-py2.py3-none-any.whl (13.2 MB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11434 sha256=3a7c2d9a9298e5150747d315030cfb3728c7dcb63309fd256c30b998140c2769
  Stored in directory: c:\users\dell\appdata\local\pip\cache\wheels\c4\a7\48\0a434133f6d56e878ca511c0e6c38326907c0792f67b476e56
Successfully built retrying
Installing collected packages: graphviz, retrying, plotly, catboost
Successfully installed catboost-0.24.4 graphviz-0.16 plotly-4.14.1 retrying-1.3.3


In [366]:
#!pip uninstall catboost

^C


In [368]:
from catboost import CatBoostClassifier, Pool

In [369]:
cat_model = CatBoostClassifier(
    iterations = 1000, # 1000 are ideal
    loss_function='MultiClass',
    bootstrap_type = "Bayesian",
    eval_metric = 'MultiClass',
    leaf_estimation_iterations = 100,
    random_strength = 0.5,
    depth = 7,
    l2_leaf_reg = 5,
    learning_rate=0.1,
    bagging_temperature = 0.5,
    task_type = "GPU",
)

In [370]:
# training the model
cat_model.fit(X_train,y_train)

CatBoostError: c:/program files (x86)/go agent/pipelines/buildmaster/catboost.git/catboost/cuda/cuda_lib/cuda_manager.cpp:201: Condition violated: `State == nullptr'

In [None]:
# predicting the model output
y_pred_cat = cat_model.predict(X_test)
# printing the accuracy of the tuned model
print("accuracy of the catboost: ",accuracy_score(y_test,y_pred_cat))

# confusion metrics of the LightGBM and plotting the same
confusion_matrix_LightGBM = confusion_matrix(y_test,y_pred_cat)
print(confusion_matrix_LightGBM) 

In [372]:
# Model evaluation for training set
from sklearn.metrics import f1_score
y_pred = kn_clf.predict(X_test)
rmse = (np.sqrt(mean_squared_error(y_test, y_pred)))
r2 = r2_score(y_test, y_pred)

# Examine the first 10 predicted output from the model
output = pd.DataFrame(y_train[0:10])
output['Predicted'] = y_pred[0:10]
output['Difference'] = output['Predicted'] - output['memType']
print(output, "\n")

print("Model training performance:")
print("---------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
#f1 = f1_score(y_test, y_pred)
#print('F1 score: %f' % f1)
print("\n")

      memType  Predicted  Difference
5476        1          1           0
3697        3          1          -2
1536        2          3           1
1399        2          2           0
1060        1          3           2
870         2          1          -1
4516        3          2          -1
582         2          1          -1
6373        3          2          -1
2971        1          1           0 

Model training performance:
---------------------------
RMSE is 0.7845005904845377
R2 score is -0.16593039774909624




In [354]:
test_pred = kn_clf.predict(test_data)
test_pred

array([2, 1, 1, ..., 2, 2, 2], dtype=int64)

In [355]:
target = kn_clf.predict(test_data)

res = pd.DataFrame(target) #target is nothing but the final predictions of your model on input features of your new unseen test data
res.index = test_data.index # its important for comparison. Here "test_new" is your new test dataset
res.columns = ['prediction']
res.to_csv("submission_kn_sprint_20.csv") # the csv file will be saved locally on the same location where this notebook is located.

In [356]:
memtype_map  = {1:'recalled',
              2:'imagined',
              3:'retold' }
res['prediction'] = res.prediction.map(memtype_map)
res.head()

Unnamed: 0,prediction
0,imagined
1,recalled
2,recalled
3,retold
4,retold


In [357]:
res.columns = ['prediction']
res.to_csv("submission_kn_sprint_20.csv")

In [326]:
rfc = KNeighborsClassifier()
boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)   # initialize the boruta selector
boruta_selector.fit(np.array(X_train), np.array(y_train))       # fitting the boruta selector to get all relavent features. 
# NOTE: BorutaPy accepts numpy arrays only.

KeyError: 'max_depth'