In [None]:
!pip install  -Uqq contractions # used to rephrase sentences like "he'll, she's not" to "he will, she is not"

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import regexp_tokenize, word_tokenize
from textblob import TextBlob, Word
import nltk
import re
# import contractions
import os
import pickle
import spacy

In [None]:
# Locate the train/test files under the Kaggle input directory.
# The match is done on the file name only (case-insensitively): matching on the
# full path would let a directory name such as ".../training-data/" cause
# every file inside it to be treated as the training file.
train_path = None
test_path = None
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        lowered_name = filename.lower()
        if 'train' in lowered_name:
            train_path = os.path.join(dirname, filename)
        elif 'test' in lowered_name:
            test_path = os.path.join(dirname, filename)

# Fail with an explicit message instead of a NameError on the prints below.
if train_path is None or test_path is None:
    raise FileNotFoundError(
        "Could not find both a 'train' and a 'test' file under /kaggle/input "
        f"(train={train_path!r}, test={test_path!r})"
    )

print("train:", train_path)
print("test:", test_path)

### Helper Functions

In [None]:
def clean(text):
    """Normalize a raw text with regular expressions.

    Steps, in order: expand contractions, strip URLs, strip e-mail addresses,
    strip punctuation, strip digits, lower-case.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string. Runs of whitespace left behind by the
        removals are not collapsed.
    """
    # Imported here because the module-level `import contractions` is commented
    # out; without this the function raises NameError on first use.
    import contractions

    # rephrase contractions ("isn't" -> "is not") while apostrophes still exist
    text = contractions.fix(text)
    # URLs and e-mails must go BEFORE punctuation: once ':', '/', '.' and '@'
    # have been stripped, these two patterns can never match.
    text = re.sub(r'https?:\S+','',text)
    text = re.sub(r'\S+@\S+','',text)
    # remove punctuation (the '+' inside the class is literal, so '+' is kept)
    text = re.sub(r'[^\w+\s]', '', text)
    # remove numbers
    text = re.sub(r'\d', '', text)
    return text.lower()

In [None]:
"""
for some reason idk lemmatizer of WordNetLemmatizer() doesn't work
"""
# NOTE(review): the NLTK-based setup and helpers in this cell are kept only as a
# disabled reference; the active cleaning path in this notebook is the
# spaCy-based `spacy_clean`. Consider deleting the cell once it is no longer needed.
# nltk.download('all-nltk')
# lemmatizer = nltk.stem.WordNetLemmatizer()
# stop_words = set(nltk.corpus.stopwords.words('english'))

# def remove_stopwords(text):
#     tokens = nltk.tokenize.word_tokenize(text)
#     return ' '.join([word for word in tokens if word not in stop_words])

# def lemmatize_sentence(text):
#     tokens = nltk.tokenize.word_tokenize(text)
#     return ' '.join([lemmatizer.lemmatize(word) for word in tokens])

# def clean_with_standardization(text):
#     # remove punctuation
#     text = re.sub(r'[^\w+\s]', '', text)
#     # remove urls
#     text = re.sub(r'https?:\S+','',text)
#     # remove numbers
#     text = re.sub(r'\d', '', text)
#     # remove emails
#     text = re.sub(r'\S+@\S+','',text)
#     # rephrase contractions
#     text = contractions.fix(text)
#     # convert to lower case
#     text = text.lower()
#     # lemmatize to normal form
#     text = lemmatize_sentence(text)
#     return text

# def clean_with_standardization_stopwords(text):
#     # remove punctuation
#     text = re.sub(r'[^\w+\s]', '', text)
#     # remove urls
#     text = re.sub(r'https?:\S+','',text)
#     # remove numbers
#     text = re.sub(r'\d', '', text)
#     # remove emails
#     text = re.sub(r'\S+@\S+','',text)
#     # rephrase contractions
#     text = contractions.fix(text)
#     # convert to lower case
#     text = text.lower()
#     #remove stop words
#     text = remove_stopwords(text)
#     # lemmatize to normal form
#     text = lemmatize_sentence(text)
#     return text

In [None]:
# spaCy-based cleaning: this approach takes more time (each text is run through
# the pipeline, then filtered in a loop and re-joined), but it is easier to
# follow and more general than the regex version.
nlp = spacy.load('en_core_web_sm')
def remove_extra_whitespaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as well.
    """
    words = text.split()
    return " ".join(words)

def spacy_clean(text):
    """Lower-case ``text``, strip URLs and return its lemmas joined by spaces.

    Only purely alphabetic tokens that are not punctuation, whitespace or
    e-mail-like are kept; stop words are deliberately NOT filtered out (the
    stop-word variant of the condition is disabled).

    NOTE(review): token pieces containing an apostrophe (e.g. the "n't" part
    of a contraction) presumably fail ``is_alpha`` and are dropped, which
    would discard negations -- confirm on a sample before relying on this
    for sentiment.
    """
    lowered = text.lower()
    # remove urls
    without_urls = re.sub(r'https?:\S+','',lowered)
    lemmas = [
        token.lemma_.lower()
        for token in nlp(without_urls)
        if token.is_alpha and not (token.is_punct or token.is_space or token.like_email)
    ]
    return ' '.join(lemmas)

# Quick sanity check of the cleaning helper on a messy sample sentence
# (the regex-based `clean` variant is left disabled).
# print(clean("Hey, This isn't a         sentence."))
print(spacy_clean("Hey, This isn't a       Busness  sentence."))

In [None]:
def fine_tune_hyperparameters(model, search_grid, X_train, y_train, score = 'f1_weighted', verbose=0):
    """Grid-search ``search_grid`` for ``model`` and return the refitted best estimator.

    Args:
        model: Unfitted scikit-learn estimator.
        search_grid: Parameter grid passed to ``GridSearchCV``.
        X_train: Training features.
        y_train: Training targets.
        score: Scoring specification forwarded to ``GridSearchCV`` (a metric
            name, a callable, or a collection/dict of metrics).
        verbose: Verbosity level forwarded to ``GridSearchCV``.

    Returns:
        ``best_estimator_`` of the fitted search.
    """
    # `refit` used to be hard-coded to 'f1_weighted' regardless of `score`.
    # For a single metric it only needs to be truthy; for several metrics it
    # must name one of them, so fall back to the first one when 'f1_weighted'
    # is not among the requested metrics. Callables keep the previous value.
    refit = 'f1_weighted'
    if score is None or isinstance(score, str):
        refit = True
    elif not callable(score):
        metric_names = list(score)
        if refit not in metric_names:
            refit = metric_names[0]

    gs = GridSearchCV(model, search_grid, scoring=score, refit=refit, verbose=verbose)
    gs.fit(X_train, y_train)
    print(f'Best score: {gs.best_score_} with param: {gs.best_params_}')
    return gs.best_estimator_

In [None]:
def save_model_weights(model, filename, output_dir='/kaggle/working/'):
    """Pickle ``model`` to ``output_dir/filename``.

    Args:
        model: Any picklable object (here: a fitted estimator).
        filename: Name of the file to create inside ``output_dir``.
        output_dir: Destination directory; defaults to the Kaggle working
            directory and is created if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, filename)
    with open(path, 'wb') as file:
        pickle.dump(model, file)

## EDA

In [None]:
# Load the training CSV found by the directory walk into a DataFrame.
df = pd.read_csv(train_path)

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

There is only one null value in the data, so we can simply drop the row that contains it.

In [None]:
# Drop every row containing a missing value (mutates `df` in place).
df.dropna(inplace=True)

In [None]:
# Summary statistics restricted to the object-typed (string) columns.
df.describe(include=object)

In [None]:
# Class balance: share of each sentiment label, in percent of all rows.
(df['sentiment'].value_counts() / df['sentiment'].shape[0]) * 100

In [None]:
# Look at the raw text of the rows whose index label is at most 6
# (`.loc` slices by label and includes the end point).
df.loc[:6,'text']

## Preprocessing

Clean the text and drop the columns we do not need.

In [None]:
# Lemmatize/normalize every tweet with the spaCy helper.
df['clean_text'] = df['text'].apply(spacy_clean)
# `errors='ignore'` keeps this cell re-runnable: a plain in-place drop raises
# KeyError the second time because 'textID' is already gone.
df = df.drop(columns=['textID'], errors='ignore')
df.columns

In [None]:
# Check the new `clean_text` column next to the original text.
df.head()

In [None]:
# Map the sentiment strings to integer class ids with a LabelEncoder; the
# fitted encoder is kept so the ids can be translated back to labels later.
encoder = LabelEncoder()
encoder.fit(df['sentiment'])
df['target'] = encoder.transform(df['sentiment'])
df.head()

### Split the data into train and test

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
# NOTE(review): the split is not stratified on the target -- consider `stratify=df['target']`.
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['target'], test_size=0.2, random_state=42)

### Vectorize the text data

In [None]:
# Learn the vocabulary and IDF weights on the training texts only, then apply
# the same fitted vectorizer to both splits.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer.fit(X_train)
tfidf_train = tfidf_vectorizer.transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

## Models

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Search the smoothing strength `alpha` over several orders of magnitude.
search_grid = dict(alpha=[0.001, 0.01, 0.1, 1])
nb_model = fine_tune_hyperparameters(MultinomialNB(), search_grid, tfidf_train, y_train)

In [None]:
# Persist the tuned Naive Bayes estimator as a pickle.
# NOTE(review): only the estimator is saved; no visible cell persists the fitted
# `tfidf_vectorizer`, which would be needed to reuse the model on new text -- confirm.
save_model_weights(nb_model, 'nb_twitter.pkl')

In [None]:
# Evaluate on the held-out TF-IDF features. Class names come from the fitted
# encoder (ordered by encoded id) instead of a hard-coded list, so the report
# rows cannot drift out of sync with the integer targets.
nb_pred = nb_model.predict(tfidf_test)
print(classification_report(y_test, nb_pred, target_names=list(encoder.classes_)))

In [None]:
from sklearn.linear_model import LogisticRegression

# The penalty is not part of the grid: adding 'penalty': ['l1', 'l2'] causes an
# error because the lbfgs solver works only with l2.
grid = dict(
    C=[0.001, 0.01, 0.1, 1],
    solver=['liblinear', 'lbfgs'],
)
log_reg = fine_tune_hyperparameters(LogisticRegression(max_iter=2000), grid, tfidf_train, y_train)

In [None]:
# Persist the tuned logistic-regression estimator as a pickle.
save_model_weights(log_reg, 'lg_twitter.pkl')

In [None]:
# Evaluate on the held-out TF-IDF features; class names are taken from the
# fitted encoder (ordered by encoded id) rather than a hard-coded list.
log_pred = log_reg.predict(tfidf_test)
print(classification_report(y_test, log_pred, target_names=list(encoder.classes_)))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the search reproducible across runs (the seed
# matches the one used for the train/test split).
tree_model = DecisionTreeClassifier(random_state=42)
search_grid = {
    'max_depth': [4,8,16,32,64,128],
    'min_samples_split': [4,8,16,32,64,128],
}

tree_model = fine_tune_hyperparameters(tree_model, search_grid, tfidf_train, y_train,score='f1_weighted')

In [None]:
# Persist the tuned decision tree as a pickle.
save_model_weights(tree_model, 'dt_twitter.pkl')

In [None]:
# Evaluate on the held-out TF-IDF features; class names are taken from the
# fitted encoder (ordered by encoded id) rather than a hard-coded list.
tree_pred = tree_model.predict(tfidf_test)
print(classification_report(y_test, tree_pred, target_names=list(encoder.classes_)))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest (bootstrap samples and feature
# sub-sampling) reproducible across runs.
rf_model = RandomForestClassifier(random_state=42)
search_grid = {
    'max_depth': [32,64,128],
    'min_samples_split': [32,64,128],
}
rf_model = fine_tune_hyperparameters(rf_model, search_grid, tfidf_train, y_train,score='f1_weighted')

In [None]:
# Persist the tuned random forest as a pickle.
save_model_weights(rf_model, 'rf_twitter.pkl')

In [None]:
# Evaluate on the held-out TF-IDF features; class names are taken from the
# fitted encoder (ordered by encoded id) rather than a hard-coded list.
rf_pred = rf_model.predict(tfidf_test)
print(classification_report(y_test, rf_pred, target_names=list(encoder.classes_)))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Search neighbourhood size (1..30), vote weighting and the Minkowski power `p`.
search_grid = dict(
    n_neighbors=list(range(1, 31)),
    weights=['uniform', 'distance'],
    p=[1, 2],
)
knn_model = fine_tune_hyperparameters(KNeighborsClassifier(), search_grid, tfidf_train, y_train)

In [None]:
# Persist the tuned k-nearest-neighbours estimator as a pickle.
save_model_weights(knn_model, 'knn_twitter.pkl')

In [None]:
# Evaluate on the held-out TF-IDF features; class names are taken from the
# fitted encoder (ordered by encoded id) rather than a hard-coded list.
knn_pred = knn_model.predict(tfidf_test)
print(classification_report(y_test, knn_pred, target_names=list(encoder.classes_)))

In [None]:
import tensorflow as tf
import tensorflow.keras.layers as layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [None]:
# Keep only what the neural model needs: the cleaned text (renamed to
# `content`) and the encoded target.
data = (
    df.drop(columns=['text', 'selected_text', 'sentiment'])
      .rename(columns={'clean_text':'content'})
)
data.head()

In [None]:
# Separate split for the neural model: 10% held out, fixed seed.
train, test = train_test_split(data, test_size=0.1, random_state=44)
print('Train dataset shape: {}'.format(train.shape))
print('Test dataset shape: {}'.format(test.shape))

# Note: these rebind the X_/y_ names previously used by the TF-IDF models.
X_train, y_train = train['content'], train['target']
X_test, y_test = test['content'], test['target']


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_vocab = 20000
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(X_train)
word_idx = tokenizer.word_index
# `word_index` lists every word seen during fitting, but `texts_to_sequences`
# only emits the `num_words - 1` most frequent indices. Cap the vocabulary size
# accordingly so the Embedding layer (sized `vocab_len + 1`) has no unused rows.
vocab_len = min(len(word_idx), max_vocab - 1)
print("the size of vocab =", vocab_len)
# Replace the raw texts by lists of integer word ids.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every id sequence to a common length of `maxlen` so the batches are rectangular.
maxlen = 100
X_train, X_test = (pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train, X_test))
train['target'].value_counts()

In [None]:
# Import everything from `tensorflow.keras`: the model below is a
# `tf.keras.Sequential`, and mixing in layers from the standalone `keras`
# package can fail when the two resolve to different Keras installations.
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, GlobalMaxPooling1D, Dropout, Bidirectional
from tensorflow.keras import optimizers

dims=100                # embedding dimension
learning_rate = 0.0001  # Adam step size

# Embedding -> dropout -> bidirectional LSTM -> small dense head over 3 classes.
model = tf.keras.Sequential([
    Input(shape=(maxlen,)),
    Embedding(vocab_len + 1, dims),  # +1 because index 0 is used for padding
    Dropout(0.5),
    Bidirectional(LSTM(150)),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

# Define optimizer with specified learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Sparse loss: the targets are integer class ids, not one-hot vectors.
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(), metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture with parameter counts.
model.summary()

In [None]:
# Train for 40 epochs, evaluating on (X_test, y_test) after every epoch.
# NOTE(review): the held-out test split doubles as the validation set, so the
# scores reported afterwards on the same data are not an independent estimate --
# consider carving out a separate validation split.
history = model.fit(X_train, y_train, epochs=40, validation_data=(X_test, y_test))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Turn the softmax outputs into hard class ids.
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)

# Fix the label set explicitly so the matrix always has one row/column per
# encoded class, even if a class happens to be absent from this split.
class_names = list(encoder.classes_)
cm = confusion_matrix(y_test, y_pred_labels, labels=list(range(len(class_names))))

# Label ticks and axes so the figure can be read on its own.
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                 xticklabels=class_names, yticklabels=class_names)
ax.set(xlabel='Predicted label', ylabel='True label', title='BiLSTM confusion matrix (test split)')
plt.show()