In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import TreebankWordTokenizer
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pandas as pd

### Read the benchmark dataset

In [2]:
# Load the labelled benchmark set and discard the stray "Unnamed: 0" column
# (presumably an index written by an earlier `to_csv` call -- confirm).
df = pd.read_csv("3_SML_benchmark.csv").drop(columns=["Unnamed: 0"])

In [3]:
# Number of labelled examples in the benchmark set.
df.shape[0]

480

In [4]:
# Class balance of the `RA` target column used for training below.
label_counts = df["RA"].value_counts()
label_counts

1    248
0    232
Name: RA, dtype: int64

### Classifier training

In [5]:
# Set aside 15% of the labelled texts as a held-out test set.  The fixed seed
# keeps the split identical across runs.
texts, labels = df["text_clean"], df["RA"]
X_news, news_test, y_news, y_test = train_test_split(
    texts,
    labels,
    test_size=0.15,
    random_state=42,
)

In [6]:
# Split the remaining (non-test) data again: 20% of it becomes the validation
# set used for model selection, the rest is the training set.
news_train, news_val, y_train, y_val = train_test_split(
    X_news,
    y_news,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Report the size of each partition as a sanity check on the two splits.
n_train, n_val, n_test = len(news_train), len(news_val), len(news_test)
print(f"We have {n_train} train, {n_val} validation and {n_test} test cases.")

We have 326 train, 82 validation and 72 test cases.


In [8]:
# Baseline candidates: every classifier paired with both a raw-count and a
# TF-IDF bag-of-words representation.  Each tuple is
# (label, vectorizer instance, classifier instance); every combination gets
# its own fresh, unfitted objects.  The classifier varies in the outer loop
# so the order is NB, LogReg, SVC -- each first with counts, then TF-IDF.
configurations = [
    (f"{clf_label} with {vec_label}", vec_cls(min_df=5, max_df=.75), clf_cls())
    for clf_label, clf_cls in (("NB", MultinomialNB),
                               ("LogReg", LogisticRegression),
                               ("SVC", SVC))
    for vec_label, vec_cls in (("Count", CountVectorizer),
                               ("TfIdf", TfidfVectorizer))
]

In [9]:
# Fit every baseline on the training split and print its classification
# report for the validation split.
for description, vectorizer, classifier in configurations:
    print(description)
    # The vocabulary (and IDF weights, where applicable) are learned from the
    # training texts only; the validation texts are merely projected onto it.
    X_train = vectorizer.fit_transform(news_train)
    X_val = vectorizer.transform(news_val)
    y_pred = classifier.fit(X_train, y_train).predict(X_val)
    report = metrics.classification_report(y_val, y_pred)
    print(report)
    print('\n')

NB with Count
              precision    recall  f1-score   support

           0       0.91      0.78      0.84        40
           1       0.81      0.93      0.87        42

    accuracy                           0.85        82
   macro avg       0.86      0.85      0.85        82
weighted avg       0.86      0.85      0.85        82



NB with TfIdf
              precision    recall  f1-score   support

           0       0.88      0.57      0.70        40
           1       0.70      0.93      0.80        42

    accuracy                           0.76        82
   macro avg       0.79      0.75      0.75        82
weighted avg       0.79      0.76      0.75        82



LogReg with Count
              precision    recall  f1-score   support

           0       0.76      0.88      0.81        40
           1       0.86      0.74      0.79        42

    accuracy                           0.80        82
   macro avg       0.81      0.81      0.80        82
weighted avg       0.81 

In [10]:
# TF-IDF + logistic-regression pipeline and the hyperparameter grid searched
# over it.  Grid keys use the `<step name>__<parameter>` convention so that
# GridSearchCV can route each value to the right pipeline step.
#
# `random_state` is pinned because the 'sag' and 'saga' solvers shuffle the
# training data; without a seed the cross-validation scores (and therefore
# the winning candidate) can differ between runs.
pipeline = Pipeline(steps = [
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])
grid = {
    'vectorizer__ngram_range' : [(1,1), (1,2)],
    'vectorizer__max_df': [0.75, 1.0],
    # An integer `min_df` is an absolute document count, so 1 ("keep every
    # term") is the smallest meaningful value.  The former value 0 selected
    # the same vocabulary, but recent scikit-learn releases validate integer
    # `min_df` as >= 1, which makes every candidate using 0 fail to fit.
    'vectorizer__min_df': [1, 2, 5],
    'classifier__C':[1,2.5,5,10],
    'classifier__solver':['lbfgs','sag','saga']
}

In [11]:
# Exhaustive search over `grid` with 3-fold cross-validation on the training
# split, scored by accuracy and parallelised over all available cores.
search = GridSearchCV(estimator=pipeline,
                      param_grid=grid,
                      scoring='accuracy',
                      cv=3,
                      n_jobs=-1,
                      verbose=10)
search.fit(news_train, y_train)
print(f'Using these hyperparameters {search.best_params_}, we get the best performance:')
# The message above announces a score, so report it: `best_score_` is the
# mean cross-validated accuracy of the winning candidate.
print(f'{search.best_score_:.3f} mean cross-validated accuracy')

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Using these hyperparameters {'classifier__C': 2.5, 'classifier__solver': 'saga', 'vectorizer__max_df': 1.0, 'vectorizer__min_df': 2, 'vectorizer__ngram_range': (1, 1)}, we get the best performance:


In [12]:
# Rebuild the TF-IDF representation with the settings reported as best by the
# grid search, fitted on the training texts only.
tfidf_params = {"min_df": 2, "max_df": 1.0, "ngram_range": (1, 1)}
vectorizer = TfidfVectorizer(**tfidf_params)
X_train = vectorizer.fit_transform(news_train)
# Validation texts are projected onto the training vocabulary.
X_val = vectorizer.transform(news_val)

In [13]:
# Validate the tuned logistic regression on the validation split.
# `random_state` pins the data shuffling performed by the stochastic 'saga'
# solver; without it the fitted coefficients -- and potentially the report
# below -- can vary between runs.
lg = LogisticRegression(C=2.5, solver='saga', max_iter=1000, random_state=42)
lg.fit(X_train, y_train)
y_pred = lg.predict(X_val)
print(metrics.classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89        40
           1       0.90      0.88      0.89        42

    accuracy                           0.89        82
   macro avg       0.89      0.89      0.89        82
weighted avg       0.89      0.89      0.89        82


In [14]:
# Same TF-IDF settings as for validation, fitted again on the training texts;
# this time the held-out test texts are projected onto the vocabulary.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=2,
    max_df=1.0,
)
X_train = vectorizer.fit_transform(news_train)
X_test = vectorizer.transform(news_test)

In [15]:
# Final evaluation of the tuned model on the held-out test split.
# `random_state` pins the data shuffling performed by the stochastic 'saga'
# solver so that the model evaluated here is reproducible on a re-run.
lg = LogisticRegression(C=2.5, solver='saga', max_iter=1000, random_state=42)
lg.fit(X_train, y_train)
y_pred = lg.predict(X_test)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.82      0.85        38
           1       0.81      0.88      0.85        34

    accuracy                           0.85        72
   macro avg       0.85      0.85      0.85        72
weighted avg       0.85      0.85      0.85        72


### Making predictions for the final dataset

In [16]:
# Load the full unlabelled corpus to be classified and discard the stray
# "Unnamed: 0" column (presumably a previously saved index -- confirm).
pred = pd.read_csv("data_dedup.csv", header=0, sep=",").drop(columns=["Unnamed: 0"])

In [17]:
# Texts to classify; the same column the classifier was trained on.
news_pred = pred.loc[:, "text_clean"]

In [18]:
# Number of documents that will be classified.
news_pred.shape[0]

118608

In [19]:
# `vectorizer` was already fitted on `news_train` when the test features were
# built, and `X_train` still holds the resulting training matrix.  Refitting
# it here on the same texts with the same settings would reproduce the same
# vocabulary, so the unlabelled corpus is only projected onto it.
X_pred = vectorizer.transform(news_pred)

In [20]:
# Label the unlabelled corpus with the classifier exactly as it was evaluated
# on the test split.  It is deliberately not refitted here: 'saga' is a
# stochastic solver, so an extra `fit` could yield a model that differs from
# the one whose test metrics were reported.
y_pred = lg.predict(X_pred)

In [21]:
# Attach the predicted label to every document as a new "keep" column.
pred = pred.assign(keep=y_pred)

In [22]:
# How many documents were predicted as class 1 vs. class 0.
keep_counts = pred["keep"].value_counts()
print(keep_counts)

1    59541
0    59067
Name: keep, dtype: int64


In [23]:
# Retain only the documents the classifier labelled 1.
is_kept = pred["keep"].eq(1)
pre = pred[is_kept]

In [24]:
# Number of documents that survive the filter.
pre.shape[0]

59541

In [25]:
# Persist the filtered corpus.  The row index is written as well (the pandas
# default), so the file gains an unnamed first column that readers will see
# as "Unnamed: 0".
pre.to_csv("3_SML_data_final.csv", index=True)