## Imports

In [173]:
import opendatasets as od
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from nltk.tokenize import regexp_tokenize, word_tokenize
import nltk
import re
import os

In [148]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        if 'train' in path:
            train_path = path
        elif 'test' in path:
            test_path = path

print("train:", train_path)
print("test:", test_path)

train: /kaggle/input/tweet-sentiment-analysis/tweet-sentiment-analysis/train.csv
test: /kaggle/input/tweet-sentiment-analysis/tweet-sentiment-analysis/test.csv


### Helper Functions

In [157]:
def clean(text):
    # remove punctuation
    text = re.sub(r'[^\w+\s]', '', text)
    # remove urls
    text = re.sub(r'https?:\S+','',text)
    # remove numbers
    text = re.sub(r'\d', '', text)
    # remove emails
    text = re.sub(r'\S+@\S+','',text)
    return text

In [191]:
def fine_tune_hyperparameters(model, search_grid, X_train, y_train, score = 'f1_weighted', verbose=0):
    gs = GridSearchCV(model, search_grid, scoring=score, refit='f1_weighted', verbose=verbose)
    gs.fit(X_train, y_train)
    print(f'Best score: {gs.best_score_} with param: {gs.best_params_}')
    return gs.best_estimator_, gs.cv_results_

## EDA

In [149]:
df = pd.read_csv(train_path)

In [150]:
df.head(10)

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative
5,28b57f3990,http://www.dothebouncy.com/smf - some shameles...,neutral
6,6e0c6d75b1,2am feedings for the baby are fun when he is a...,positive
7,50e14c0bb8,Soooo high,neutral
8,e050245fbd,Both of you,neutral
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe....,positive


In [151]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   textID     27481 non-null  object
 1   text       27480 non-null  object
 2   sentiment  27481 non-null  object
dtypes: object(3)
memory usage: 644.2+ KB


In [152]:
df.isna().sum()

textID       0
text         1
sentiment    0
dtype: int64

there is only one null in the data, so we can drop it

In [153]:
df.dropna(inplace=True)

In [154]:
df.describe(include=object)

Unnamed: 0,textID,text,sentiment
count,27480,27480,27480
unique,27480,27480,3
top,cb774db0d1,"I`d have responded, if I were going",neutral
freq,1,1,11117


In [155]:
(df['sentiment'].value_counts() / df['sentiment'].shape[0]) * 100

sentiment
neutral     40.454876
positive    31.229985
negative    28.315138
Name: count, dtype: float64

In [156]:
df.loc[:6,'text']

0                  I`d have responded, if I were going
1        Sooo SAD I will miss you here in San Diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     Sons of ****, why couldn`t they put them on t...
5    http://www.dothebouncy.com/smf - some shameles...
6    2am feedings for the baby are fun when he is a...
Name: text, dtype: object

## Preprocessing

Clean and drop unnecessary columns

In [158]:
df['clean_text'] = df['text'].apply(clean)
df.drop(columns=['textID'], inplace=True)
df.columns

Index(['text', 'sentiment', 'clean_text'], dtype='object')

In [159]:
encoder = {'negative': 0, 'neutral': 1, 'positive': 2}
df['sentiment'] = df['sentiment'].apply(lambda x: encoder[x])
df.head()

Unnamed: 0,text,sentiment,clean_text
0,"I`d have responded, if I were going",1,Id have responded if I were going
1,Sooo SAD I will miss you here in San Diego!!!,0,Sooo SAD I will miss you here in San Diego
2,my boss is bullying me...,0,my boss is bullying me
3,what interview! leave me alone,0,what interview leave me alone
4,"Sons of ****, why couldn`t they put them on t...",0,Sons of why couldnt they put them on the rel...


### Split the data into train and test

In [160]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['sentiment'], test_size=0.2)

### Vectorize the text data

In [161]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

## Models

In [162]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_model.fit(tfidf_train, y_train)
logestic_pred = nb_model.predict(tfidf_test)
accuracy_score(y_test, logestic_pred)

0.620269286754003

In [163]:
from sklearn.linear_model import LogisticRegression
logestic_model = LogisticRegression(max_iter=1000)
logestic_model.fit(tfidf_train, y_train)
logestic_pred = logestic_model.predict(tfidf_test)
accuracy_score(y_test, logestic_pred)

0.6945050946142649

In [None]:
def plot_cv_scores(cv_results):

In [190]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
search_grid = {
    'max_depth': [4,8,16,32,64,128],
    'min_samples_split': [4,8,16,32,64,128],
}

tree_model, tree_cv_scores = fine_tune_hyperparameters(tree_model, search_grid, tfidf_train, y_train,score='f1_weighted')
tree_pred = tree_model.predict(tfidf_test)
f1_score(y_test, tree_pred, average='weighted')

Best score: 0.6626990790988165 with param: {'max_depth': 128, 'min_samples_split': 128}


In [193]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
search_grid = {
    'max_depth': [4,8,16,32,64,128],
    'min_samples_split': [4,8,16,32,64,128],
}
rf_model, rf_cv_scores = fine_tune_hyperparameters(rf_model, search_grid, tfidf_train, y_train,score='f1_weighted')
rf_pred = rf_model.predict(tfidf_test)
accuracy_score(y_test, rf_pred)


KeyboardInterrupt



In [166]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_model.fit(tfidf_train, y_train)
knn_pred = knn_model.predict(tfidf_test)
accuracy_score(y_test, knn_pred)

0.47652838427947597