In [1]:
import pandas as pd

### Data Preprocessing

##### Get data

Data: Twitter sentiment analysis data from [Kaggle](https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset), with 3 classes; Negative, Positive, Neutral

In [2]:
df = pd.concat(
    (pd.read_csv('data/train.csv'),
     pd.read_csv('data/test.csv')
     )
    )[['textID', 'text', 'selected_text', 'sentiment']]

print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 32296 entries, 0 to 4814
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         31015 non-null  object
 1   text           31014 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      31015 non-null  object
dtypes: object(4)
memory usage: 1.2+ MB
None


Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [3]:
df = df.dropna()
df.shape

(27480, 4)

In [4]:
# checking for duplicates
df['textID'].duplicated().sum()

0

In [5]:
# label encoding
df['sentiment'] = df['sentiment'].map({'neutral': 0, 'positive': 1, 'negative': -1})
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,-1
2,088c60f138,my boss is bullying me...,bullying me,-1
3,9642c003ef,what interview! leave me alone,leave me alone,-1
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",-1


In [6]:
df['text'] = df['text'].astype('string')
df['sentiment'] = df['sentiment'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27480 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   textID         27480 non-null  object  
 1   text           27480 non-null  string  
 2   selected_text  27480 non-null  object  
 3   sentiment      27480 non-null  category
dtypes: category(1), object(2), string(1)
memory usage: 885.7+ KB


##### Data Cleaning

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer


# Download required NLTK resources
nltk.download('stopwords')

# Stop words set
stop_words = set(stopwords.words('english'))

# Initialize the Lancaster Stemmer
lancaster_stemmer = LancasterStemmer()

# Regex to clean mentions, URLs, and non-alphanumeric characters
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9\s]+"

# Function to clean the text, including removing stopwords, stemming, etc.
def clean_text(text):
    # Step 1: Remove HTML tags
    text = re.sub(r'<.*?>', '', str(text))
    
    # Step 2: Apply the custom text cleaning regex (for mentions, URLs, special characters)
    text = re.sub(TEXT_CLEANING_RE, ' ', text)
    
    # Step 3: Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Step 4: Lowercase the text
    text = text.lower()
    
    # Step 5: Tokenize the text and apply stemming while removing stopwords
    text = " ".join([lancaster_stemmer.stem(word) for word in text.split() if word not in stop_words])
    
    # Step 6: Remove extra spaces and strip leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Applying the cleaning function with Lancaster stemming
df['text'] = df['text'].apply(clean_text)


df.head()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vange\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,respond going,"I`d have responded, if I were going",0
1,549e992a42,sooo sad miss san diego,Sooo SAD,-1
2,088c60f138,boss bul,bullying me,-1
3,9642c003ef,interview leav alon,leave me alone,-1
4,358bd9e861,son put releas already bought,"Sons of ****,",-1


##### Text Tokenization

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.2, random_state=42)

##### Logistic Regression (One VS Rest)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', OneVsRestClassifier(LogisticRegression(max_iter=1000)))
])

# Define the parameter grid
param_grid = {
    'tfidf__max_features': [3000, 5000, 7000],  
    'clf__estimator__C': [0.1, 1, 10],  # Regularization strength for LogisticRegression
    'clf__estimator__solver': ['liblinear', 'lbfgs']  # Solvers to use 
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the best model
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters found:  {'clf__estimator__C': 1, 'clf__estimator__solver': 'lbfgs', 'tfidf__max_features': 3000}
              precision    recall  f1-score   support

          -1       0.71      0.57      0.63      1572
           0       0.63      0.75      0.69      2236
           1       0.76      0.71      0.74      1688

    accuracy                           0.69      5496
   macro avg       0.70      0.68      0.68      5496
weighted avg       0.69      0.69      0.69      5496



##### Random Forest Classifier (One VS Rest)

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(RandomForestClassifier()))
])

# Define the parameter grid for RandomForestClassifier
param_grid = {
    'tfidf__max_features': [3000, 5000, 7000],
    'clf__estimator__n_estimators': [50, 100, 200],  # Number of trees in the forest
    'clf__estimator__max_depth': [None, 10, 20, 30],  # Maximum depth of each tree
    'clf__estimator__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'clf__estimator__min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'clf__estimator__bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the best model
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))
 

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters found:  {'clf__estimator__bootstrap': False, 'clf__estimator__max_depth': None, 'clf__estimator__min_samples_leaf': 4, 'clf__estimator__min_samples_split': 2, 'clf__estimator__n_estimators': 100, 'tfidf__max_features': 7000}
              precision    recall  f1-score   support

          -1       0.72      0.62      0.67      1572
           0       0.68      0.69      0.69      2236
           1       0.72      0.79      0.75      1688

    accuracy                           0.70      5496
   macro avg       0.71      0.70      0.70      5496
weighted avg       0.70      0.70      0.70      5496



##### XGBoost Classifier

In [9]:
# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on training data and transform both train and test data
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [10]:
from xgboost import XGBClassifier

# Map the values in y to the expected classes
y_train_mapped = y_train.map({-1: 0, 0: 1, 1: 2})
y_test_mapped = y_test.map({-1: 0, 0: 1, 1: 2})

#Model Training
model = XGBClassifier(objective='multi:softmax', num_class=3, n_estimators=1000)
model.fit(X_train, y_train_mapped)

In [15]:
# Model Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test_mapped, y_pred, target_names={'negative': 0, 'neutral': 1, 'positive': 2}))

              precision    recall  f1-score   support

    negative       0.69      0.59      0.64      1572
     neutral       0.64      0.73      0.68      2236
    positive       0.76      0.72      0.74      1688

    accuracy                           0.69      5496
   macro avg       0.70      0.68      0.69      5496
weighted avg       0.69      0.69      0.69      5496

