In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

In [None]:
# parameters
# Number of training rows to sample; -1 means all rows are processed.
sample_size = -1

# Feature switches as (name, include?) pairs, kept in a fixed order.
# A True flag includes that feature in the model.
features = dict([
    ('d_chall_score', True),
    ('aoa_mean', False),
    ('aoa_min', False),
    ('aoa_max', True),
    ('conc_rating_mean', False),
    ('conc_rating_min', False),
    ('conc_rating_max', True),
    ('num_lemmas', True),
    ('tf-idf', False),
])

In [None]:
# Both splits are tab-separated files; the 'Unnamed: 0' column is discarded
# right after loading.
train = pd.read_csv('assets/Training_set.csv', sep='\t').drop(columns='Unnamed: 0')
test = pd.read_csv('assets/Testing_set.csv', sep='\t').drop(columns='Unnamed: 0')

# embed = pd.read_csv('/content/drive/Shareddrives/Milestone 2/embeddings.csv', delimiter='\t')

In [None]:
# Columns switched on in `features`. 'tf-idf' is a switch rather than a data
# column: when it is on, the raw 'lemmatized_text' column is carried along as
# the last column so it can be vectorized in a later step.
included_features = [name for name, enabled in features.items()
                     if enabled and name != 'tf-idf']
if features['tf-idf']:
    included_features.append('lemmatized_text')

# sample_size == -1 means "use every row". Otherwise cap the request at the
# number of available rows, because DataFrame.sample (without replacement)
# raises when asked for more rows than exist.
n = len(train) if sample_size == -1 else min(sample_size, len(train))

# Sample whole rows once and derive both features and labels from that same
# frame, so X and y stay aligned regardless of what the index looks like.
# The fixed seed makes the drawn subset identical on every run.
train_sample = train.sample(n=n, random_state=42)
X_train = train_sample[included_features]
y_train = train_sample.iloc[:, -1]  # the label is read from the last column
X_test = test[included_features]
y_test = test.iloc[:, -1]

In [None]:
# When the 'tf-idf' switch is on, replace the raw text column with its TF-IDF
# representation and stack it next to the remaining feature columns. When it
# is off, X_train / X_test contain no 'lemmatized_text' column and are left
# untouched.
if features['tf-idf']:
    # Fit the vocabulary on the training text only, then reuse it for the test text.
    tfidf = TfidfVectorizer(min_df=25, stop_words='english', ngram_range=(1,3))
    X_train_vec = tfidf.fit_transform(X_train['lemmatized_text'])
    X_test_vec = tfidf.transform(X_test['lemmatized_text'])

    # Keep every selected column except the text itself. Selecting by name
    # (instead of a positional slice) guarantees no enabled feature is lost.
    X_train_mads = csr_matrix(X_train.drop(columns='lemmatized_text'))
    X_test_mads = csr_matrix(X_test.drop(columns='lemmatized_text'))

    # Result layout: TF-IDF columns first, then the other features.
    # CSR is requested so the stacked matrices support row indexing.
    X_train = hstack((X_train_vec, X_train_mads), format='csr')
    X_test = hstack((X_test_vec, X_test_mads), format='csr')

In [None]:
# pipe = Pipeline([('scale', StandardScaler(with_mean=False)), ('impute', SimpleImputer()), ('svc', SVC())])


# svc = SVC()
# svc.fit(X_train_all, y_train)
# pred = svc.predict(X_test_all)
# classification_report(y_test, pred)
# pipe.fit(X_train_vec, y_train)
# pipe.score(X_test_vec, y_test)


In [83]:
# Disabled experiment: impute and standardize the features, then grid-search
# an SVC over C, gamma and kernel. Uncomment the whole cell to re-run it.
# NOTE(review): StandardScaler() centers by default, which does not work on
# sparse input; X_train is sparse when 'tf-idf' is enabled - confirm before
# re-enabling with that switch on.

# imputer = SimpleImputer()
# scaler = StandardScaler()

# X_train_imputed = imputer.fit_transform(X_train)
# X_test_imputed = imputer.transform(X_test)

# X_train_scaled = scaler.fit_transform(X_train_imputed)
# Reuse the statistics learned on the training set (this line was
# fit_transform, which re-fits the scaler on the test set and scales the two
# splits inconsistently).
# X_test_scaled = scaler.transform(X_test_imputed)

# 5 values of C x 5 values of gamma x 3 kernels = 75 candidates.
# param_grid = {'C': [0.1, 1, 10, 100, 1000],
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['linear','poly','rbf']}

# grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

# grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.621 total time=   2.9s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.625 total time=   3.4s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.623 total time=   3.3s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.604 total time=   3.1s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.637 total time=   3.6s
[CV 1/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.590 total time=   6.2s
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.594 total time=   6.7s
[CV 3/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.584 total time=   6.6s
[CV 4/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.590 total time=   7.2s
[CV 5/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.584 total time=   6.4s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.618 total time=   6.6s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf

In [None]:
# print('Best Parameters: ', grid.best_params_)
# print('Best Model: ', grid.best_estimator_)

# pred = grid.predict(X_test_scaled)
# classification_report(y_test, pred)

In [None]:
# import pickle

# s = pickle.dumps(pipe['svc'])