In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from nltk.tokenize import TweetTokenizer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
def perform_cv(model, X, y):
    f1_scores = []
    X_t, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    model.fit(X_t, y_train)
    model_pred = model.predict(X_test)
    print(classification_report(y_test, model_pred))
    f1 = f1_score(y_test, model_pred,average="weighted")
    print("F1 score: "+str(f1))
    f1_scores.append(f1)
    return f1_scores

In [3]:
def tokenize(text): 
    tknzr = TweetTokenizer()
    return tknzr.tokenize(text)

In [4]:
processed_train = pd.read_csv('data/processed_data.csv')
processed_train.dropna(inplace=True)

In [51]:
processed_train = pd.read_csv('data/train_lemmatized.csv')
processed_train.dropna(inplace=True)

In [5]:
processed_train['sentiment'].value_counts()

1    4309
2    2382
0     456
3     125
Name: sentiment, dtype: int64

In [6]:
test_data = pd.read_csv('data/test_processed.csv')

In [53]:
test_data = pd.read_csv('data/test_lemmatized.csv')

In [7]:
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(processed_train['tweet'])

In [66]:
tf = TfidfVectorizer()
X_train = tf.fit_transform(processed_train['tweet'])

In [18]:
X_train.shape

(7272, 8332)

In [19]:
X_test = count_vectorizer.transform(test_data['tweet'])

In [20]:
X_test.shape

(1819, 8332)

In [21]:
ros = RandomOverSampler(random_state=3)
data_res, label_res = ros.fit_sample(X_train, processed_train['sentiment'])

In [22]:
data_res.shape

(17236, 8332)

In [23]:
label_res.shape

(17236,)

In [16]:
lr_model = LogisticRegression(max_iter=7600)

In [26]:
xgb = XGBClassifier(max_depth=7, n_estimators=200)

In [72]:
gnb = GaussianNB()

In [None]:
svc = SVC()

In [27]:
f1_scores = perform_cv(xgb, data_res, label_res)

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      1425
           1       0.73      0.69      0.71      1428
           2       0.75      0.71      0.73      1387
           3       0.96      1.00      0.98      1448

    accuracy                           0.84      5688
   macro avg       0.84      0.84      0.84      5688
weighted avg       0.84      0.84      0.84      5688

F1 score: 0.8396630326323239


In [230]:
np.mean(f1_scores)

0.6491923836728479

In [248]:
f1_scores = perform_cv(lr_model, data_res, label_res)

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      1412
           1       0.78      0.72      0.75      1450
           2       0.77      0.75      0.76      1426
           3       0.96      0.99      0.98      1400

    accuracy                           0.86      5688
   macro avg       0.86      0.86      0.86      5688
weighted avg       0.85      0.86      0.86      5688

F1 score: 0.8555155603033034
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      1425
           1       0.81      0.67      0.73      1479
           2       0.74      0.79      0.77      1380
           3       0.94      1.00      0.97      1404

    accuracy                           0.86      5688
   macro avg       0.85      0.86      0.85      5688
weighted avg       0.85      0.86      0.85      5688

F1 score: 0.8539231156849446
              precision    recall  f1-score   support

           0     

In [252]:
lr_model.fit(data_res, label_res)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=7600,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [253]:
upsample_pred = lr_model.predict(X_test)

In [254]:
to_submit = pd.DataFrame({'tweet_id': test_data['tweet_id'], 'sentiment': upsample_pred})
to_submit.to_csv('solutions/upsample_pred.csv', index=False)
to_submit.head()

Unnamed: 0,tweet_id,sentiment
0,7506,1
1,7992,1
2,247,1
3,7688,2
4,3294,2


In [255]:
to_submit['sentiment'].value_counts()

1    1043
2     594
0     147
3      35
Name: sentiment, dtype: int64

In [477]:
probs = lr_model.predict_proba(X_test)

for i in range(0,1819):
    prob_0 = probs[i,0]
    prob_1 = probs[i,1]
    prob_2 = probs[i,2]
    prob_3 = probs[i,3]
    if prob_1 > 0.75:
        probs[i,3] = 2
    if prob_3 > 0.2:
        probs[i,3] = 2
    if prob_0 > 0.25:
        probs[i,0] = 2
    if prob_2 > 0.6:
        probs[i,2] = 2

In [478]:
probs

array([[2.11989038e-02, 8.79853530e-01, 8.40660041e-02, 2.00000000e+00],
       [1.03548014e-03, 7.78450651e-01, 5.80816510e-02, 2.00000000e+00],
       [7.75008989e-03, 5.38446380e-01, 4.47055992e-01, 6.74753845e-03],
       ...,
       [1.16471413e-02, 3.40044767e-01, 2.00000000e+00, 6.83330723e-03],
       [2.09987891e-01, 4.45039459e-01, 2.49126504e-01, 9.58461452e-02],
       [3.60151946e-03, 9.13358004e-01, 4.59061361e-02, 2.00000000e+00]])

In [479]:
final_class_list = []
for i in range(0,1819):
    prob_0 = probs[i,0]
    prob_1 = probs[i,1]
    prob_2 = probs[i,2]
    prob_3 = probs[i,3]
    d = {'0': prob_0, '1': prob_1, '2': prob_2, '3': prob_3}
    max_class = max(d, key=d.get)
    final_class_list.append(int(max_class))

In [480]:
final_class_arr = np.asarray(final_class_list)

In [481]:
final_class_arr.shape

(1819,)

In [482]:
to_submit = pd.DataFrame({'tweet_id': test_data['tweet_id'], 'sentiment': final_class_arr})
to_submit.to_csv('probability_manipulation/pred_proba.csv', index=False)
to_submit['sentiment'].value_counts()

3    640
2    565
1    408
0    206
Name: sentiment, dtype: int64