In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from string import punctuation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
train = pd.read_csv("training_twitter_x_y_train.csv")
print(train.shape)
train.head()

(10980, 12)


Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [4]:
drop_cols = ['airline_sentiment_gold','name','tweet_id', 'retweet_count','tweet_created','user_timezone','tweet_coord','tweet_location']
train.drop(drop_cols, axis=1, inplace=True)
train.head()

Unnamed: 0,airline_sentiment,airline,negativereason_gold,text
0,negative,Southwest,,"@SouthwestAir I am scheduled for the morning, ..."
1,positive,Southwest,,@SouthwestAir seeing your workers time in and ...
2,positive,United,,@united Flew ORD to Miami and back and had gr...
3,negative,Southwest,,@SouthwestAir @dultch97 that's horse radish 😤🐴
4,negative,United,,@united so our flight into ORD was delayed bec...


In [5]:
stops = set(stopwords.words('english'))
stops.update(set(punctuation))
stops.update(['flight', 'airline', 'flights', 'AA'])

In [6]:
abbreviations = {
    'ppl': 'people', 'cust': 'customer', 'serv': 'service',
    'mins': 'minutes', 'hrs': 'hours', 'svc': 'service',
    'u': 'you', 'pls': 'please'
}

In [7]:
def preprocess_text(text):
    # Remove links
    text = re.sub('((www\\.[^\\s]+)|(https?://[^\\s]+))', '', text)
    # Remove usernames
    text = re.sub('@[^\\s]+', '', text)
    # Remove additional whitespaces
    text = re.sub('[\\s]+', ' ', text)
    # Replace #word with word
    text = re.sub(r'#([^\\s]+)', r'\\1', text)
    # Trim tweet
    text = text.strip('\'\"')

    words = []
    for word in text.split():
        word = word.lower()
        if word not in stops:
            word = abbreviations.get(word, word)
            if not any(char.isdigit() for char in word):  # Remove words with numbers
                words.append(word)
                
    return ' '.join(words)

In [8]:
train['processed_text'] = train['text'].apply(preprocess_text)

In [9]:
train['processed_text'] = train.apply(lambda row: f"{row['processed_text']} {row['airline']}", axis=1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(train['processed_text'], train['airline_sentiment'], test_size=0.2, random_state=42)

In [11]:
vectorizer = TfidfVectorizer(max_features=3150, max_df=0.8)
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [12]:
print("Training set shape:", X_train_features.shape)
print("Testing set shape:", X_test_features.shape)

Training set shape: (8784, 3150)
Testing set shape: (2196, 3150)


In [13]:
def grid_search_and_evaluate(model, param_grid, X_train, y_train, X_test, y_test, model_name):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    
    print(f"Best parameters for {model_name}: {best_params}")
    predictions = best_model.predict(X_test)
    
    print(f"\n{model_name} Results:")
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\n" + "="*50 + "\n")


In [14]:
lr_param_grid = {
    'C': [0.1, 0.5, 1.0, 2.1, 5.0],
    'solver': ['liblinear', 'lbfgs'],
    'multi_class': ['auto', 'ovr']
}

In [15]:
lr_model = LogisticRegression(random_state=42)

In [16]:
grid_search_and_evaluate(lr_model, lr_param_grid, X_train_features, y_train, X_test_features, y_test, "Logistic Regression")

Fitting 5 folds for each of 20 candidates, totalling 100 fits




Best parameters for Logistic Regression: {'C': 5.0, 'multi_class': 'ovr', 'solver': 'lbfgs'}

Logistic Regression Results:
Accuracy: 0.7800546448087432

Classification Report:
              precision    recall  f1-score   support

    negative       0.81      0.92      0.86      1356
     neutral       0.63      0.50      0.56       458
    positive       0.78      0.62      0.69       382

    accuracy                           0.78      2196
   macro avg       0.74      0.68      0.71      2196
weighted avg       0.77      0.78      0.77      2196


Confusion Matrix:
[[1247   83   26]
 [ 190  229   39]
 [  95   50  237]]




In [17]:
svm_param_grid = {
    'C': [0.1, 0.5, 1.0, 0.96, 2.0],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

In [18]:
svm_model = SVC(random_state=42)

In [19]:
grid_search_and_evaluate(svm_model, svm_param_grid, X_train_features, y_train, X_test_features, y_test, "SVM")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters for SVM: {'C': 2.0, 'gamma': 'scale', 'kernel': 'rbf'}

SVM Results:
Accuracy: 0.7750455373406193

Classification Report:
              precision    recall  f1-score   support

    negative       0.81      0.92      0.86      1356
     neutral       0.63      0.48      0.54       458
    positive       0.78      0.61      0.69       382

    accuracy                           0.78      2196
   macro avg       0.74      0.67      0.70      2196
weighted avg       0.77      0.78      0.76      2196


Confusion Matrix:
[[1248   82   26]
 [ 199  220   39]
 [ 100   48  234]]


