In [1]:
import sys
sys.path.append('../..')
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
def _parse_token_list(raw):
    """Split a bracketed, quote-wrapped, comma-separated string into a list of tokens.

    Presumably the CSV stores each token list as its Python repr
    (e.g. "['foo', 'bar']") -- TODO confirm against the preparation step.
    """
    inner = raw[1:-1]                  # drop the first and last character
    unquoted = inner.replace("'", "")  # remove every single quote
    return unquoted.split(', ')


# Only the token column and the label column are read from the prepared file.
df = pd.read_csv('../../datas/prepared/prepared.csv', usecols=['corpus', 'class'])
df['corpus'] = df['corpus'].apply(_parse_token_list)

TF-IDF

In [3]:
# Re-join each document's tokens into a single space-separated string, which is
# the input form handed to the vectorizer.
documents = df['corpus'].apply(' '.join)

# max_df=0.95 / min_df=2 prune terms by document frequency; English stop words are removed.
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)
X = vectorizer.fit_transform(documents)
Y = df['class'].values

# NOTE(review): the vectorizer is fitted on the whole corpus before any
# train/test split, so vocabulary and IDF weights also see documents that later
# end up in the test sets. Consider fitting it inside a Pipeline instead.

In [23]:
# Class balance among the first 10,000 labels.
head_labels = pd.Series(Y[:10000])
head_labels.value_counts()

0    5085
1    4915
Name: count, dtype: int64

In [24]:
# Work on the first 10,000 rows only (a head slice, not a random sample).
X_down_sampled = X[:10000]
Y_down_sampled = Y[:10000]

# 80/20 hold-out split of that subset with a fixed seed.
(
    X_train_down_sampled,
    X_test_down_sampled,
    y_train_down_sampled,
    y_test_down_sampled,
) = train_test_split(X_down_sampled, Y_down_sampled, random_state=42, test_size=0.2)

In [25]:
# Candidate classifiers, keyed by the label printed in the comparison loop.
# Insertion order is the order in which they are evaluated.
models_config = dict(
    naive_bayes=MultinomialNB(),
    RandomForest=RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0),
    svm=SVC(gamma='scale'),
    knn=KNeighborsClassifier(),
    logistic_regression=LogisticRegression(random_state=0),
)

In [26]:
# Fit every candidate on the subset's training split and report three figures:
# score on the training split, score on the held-out split, and the mean
# 5-fold cross-validation score over the whole subset.
for model_name, model in models_config.items():
    print(f"Model: {model_name}")
    model.fit(X_train_down_sampled, y_train_down_sampled)

    train_score = model.score(X_train_down_sampled, y_train_down_sampled)
    print(f"Training score: {train_score}")

    test_score = model.score(X_test_down_sampled, y_test_down_sampled)
    print(f"Testing score: {test_score}")

    fold_scores = cross_val_score(model, X_down_sampled, Y_down_sampled, cv=5)
    print(f"Cross validation score: {fold_scores.mean()}")
    print()

Model: naive_bayes
Training score: 0.833
Testing score: 0.7665
Cross validation score: 0.7779

Model: RandomForest
Training score: 0.828875
Testing score: 0.8065
Cross validation score: 0.805

Model: svm
Training score: 0.988375
Testing score: 0.9035
Cross validation score: 0.9020999999999999

Model: knn
Training score: 0.511875
Testing score: 0.516
Cross validation score: 0.5089

Model: logistic_regression
Training score: 0.931125
Testing score: 0.895
Cross validation score: 0.8985000000000001



In [27]:
# Tune the naive Bayes `alpha` parameter with a 5-fold grid search on the full
# feature matrix.
model = models_config['naive_bayes']
parameters = {'alpha': [0.01, 0.1, 0.5, 1, 2]}
gs_clf = GridSearchCV(model, parameters, cv=5, n_jobs=-1)
gs_clf.fit(X, Y)
print(gs_clf.best_estimator_)

# `best_score_` is the mean cross-validated score of the best parameter setting,
# computed by the search itself on the same 5 folds. Re-running cross_val_score
# on the best estimator would repeat that work to obtain the same figure.
# NOTE(review): alpha was selected on these very folds, so this score is an
# optimistic estimate; use nested CV or a held-out set for an unbiased one.
print(f"Cross validation score: {gs_clf.best_score_}")

MultinomialNB(alpha=0.1)
Cross validation score: 0.9065678732251928


In [31]:
# 80/20 hold-out split of the full feature matrix and labels with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42, test_size=0.2)

In [43]:
# Hyper-parameters of the final logistic-regression model.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
# (1.5+) -- confirm against the installed version. Dropping it may change the
# fitted model on binary labels, so it is kept as-is here.
logreg_params = dict(
    C=3,
    max_iter=100,
    solver='sag',
    random_state=0,
    multi_class='multinomial',
)
model = LogisticRegression(**logreg_params)

# Fit on the training split, then report train / test / 5-fold CV scores.
model.fit(X_train, y_train)

train_score = model.score(X_train, y_train)
print(f"Training score: {train_score}")

test_score = model.score(X_test, y_test)
print(f"Testing score: {test_score}")

fold_scores = cross_val_score(model, X, Y, cv=5)
print(f"Cross validation score: {fold_scores.mean()}")

Training score: 0.9578892923720138
Testing score: 0.9352398603508469
Cross validation score: 0.9335367770945837
