In [77]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics 
from sklearn.preprocessing import StandardScaler

# Reproduce the NCT model in python

In [78]:
# load training data - themes
train = pd.read_csv('../../datasets/nct_datasets/tavern_factors.csv')
train_df = train[['duration','onOffBeat','ArrSkiSteLeap','DepSkiSteLeap','Response']]
train_df.head()

Unnamed: 0,duration,onOffBeat,ArrSkiSteLeap,DepSkiSteLeap,Response
0,2.0,onbeat,unison,skip,1
1,1.0,onbeat,skip,step,0
2,0.5,onbeat,step,step,0
3,0.5,offbeat,step,leap,1
4,1.0,onbeat,leap,step,0


In [79]:
#load testing data - haydn
test = pd.read_csv('../../datasets/nct_datasets/haydn_factors.csv')
test_df = test[['duration','onOffBeat','ArrSkiSteLeap','DepSkiSteLeap','Response']]
test_df.head()

Unnamed: 0,duration,onOffBeat,ArrSkiSteLeap,DepSkiSteLeap,Response
0,1.0,onbeat,unison,step,1
1,0.5,onbeat,step,skip,0
2,0.5,offbeat,skip,step,0
3,1.0,onbeat,step,step,1
4,0.5,onbeat,step,skip,0


In [80]:
#pre-processing 
mint = {'unison':'unison','skip': 'leap','step':'step','leap':'leap'}
train_df['ArrSkiSteLeap'] = train_df['ArrSkiSteLeap'].apply(lambda x: mint[x])
train_df['DepSkiSteLeap'] = train_df['DepSkiSteLeap'].apply(lambda x: mint[x])
test_df['ArrSkiSteLeap'] = test_df['ArrSkiSteLeap'].apply(lambda x: mint[x])
test_df['DepSkiSteLeap'] = test_df['DepSkiSteLeap'].apply(lambda x: mint[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

In [81]:
X_train = train_df.drop('Response', axis=1)
y_train = train_df['Response']
X_test = test_df.drop('Response', axis=1)
y_test = test_df['Response']

In [82]:
# transfer dummy variables into one-hot encoding - training set
X_train_en = pd.get_dummies(X_train, columns=['onOffBeat'], drop_first=False)
X_train_en1 = pd.get_dummies(X_train_en, columns=['ArrSkiSteLeap'], drop_first=False)
X_train_en2 = pd.get_dummies(X_train_en1, columns=['DepSkiSteLeap'], drop_first=False)
X_train = X_train_en2
X_train.head()

Unnamed: 0,duration,onOffBeat_offbeat,onOffBeat_onbeat,ArrSkiSteLeap_leap,ArrSkiSteLeap_step,ArrSkiSteLeap_unison,DepSkiSteLeap_leap,DepSkiSteLeap_step,DepSkiSteLeap_unison
0,2.0,0,1,0,0,1,1,0,0
1,1.0,0,1,1,0,0,0,1,0
2,0.5,0,1,0,1,0,0,1,0
3,0.5,1,0,0,1,0,1,0,0
4,1.0,0,1,1,0,0,0,1,0


In [83]:
# transfer dummy variables into one-hot encoding - test set
X_test_en = pd.get_dummies(X_test, columns=['onOffBeat'], drop_first=False)
X_test_en1 = pd.get_dummies(X_test_en, columns=['ArrSkiSteLeap'], drop_first=False)
X_test_en2 = pd.get_dummies(X_test_en1, columns=['DepSkiSteLeap'], drop_first=False)
X_test = X_test_en2
X_test.head()

Unnamed: 0,duration,onOffBeat_offbeat,onOffBeat_onbeat,ArrSkiSteLeap_leap,ArrSkiSteLeap_step,ArrSkiSteLeap_unison,DepSkiSteLeap_leap,DepSkiSteLeap_step,DepSkiSteLeap_unison
0,1.0,0,1,0,0,1,0,1,0
1,0.5,0,1,0,1,0,1,0,0
2,0.5,1,0,1,0,0,0,1,0
3,1.0,0,1,0,1,0,0,1,0
4,0.5,0,1,0,1,0,1,0,0


In [96]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


In [97]:
#add interactions & train 

poly = PolynomialFeatures(degree = 2, interaction_only=False, include_bias = False)
poly.fit_transform(X_train)
logit_model = LogisticRegression()
logit_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [98]:
y_pred = logit_model.predict(X_test)

In [99]:
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy_percentage = 100 * accuracy
accuracy_percentage

71.06848446417247

In [100]:
print(metrics.f1_score(y_test, y_pred, average="micro"))
print(metrics.precision_score(y_test, y_pred, average="micro"))
print(metrics.recall_score(y_test, y_pred, average="micro"))
print(metrics.confusion_matrix(y_test, y_pred))

0.7106848446417247
0.7106848446417248
0.7106848446417248
[[1774 2486]
 [1164 7192]]


In [101]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.60      0.42      0.49      4260
           1       0.74      0.86      0.80      8356

    accuracy                           0.71     12616
   macro avg       0.67      0.64      0.65     12616
weighted avg       0.70      0.71      0.69     12616

