In [1]:
from sklearn import metrics
import numpy as np 
import pandas as pd 
import re, string, nltk
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the raw airline-tweet dataset from the Kaggle input directory
# and preview its first rows.
df = pd.read_csv(
    "/kaggle/input/twitter-airline-sentiment/Tweets.csv"
)
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
# Keep only the tweets whose sentiment label confidence exceeds 0.65,
# so the model learns from the more reliable annotations.
is_confident = df["airline_sentiment_confidence"].gt(0.65)
df0 = df.loc[is_confident]
df0.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
5,570300767074181121,negative,1.0,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,2015-02-24 11:14:33 -0800,,Pacific Time (US & Canada)


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the confident tweets for testing; the fixed seed keeps the
# split repeatable across runs.
train, test = train_test_split(df0, random_state=1000, test_size=0.2)

# Tweet text is taken as NumPy arrays; the sentiment labels stay pandas Series.
train_x, test_x = train['text'].values, test['text'].values
train_y, test_y = train['airline_sentiment'], test['airline_sentiment']

In [5]:
#import english stopwords
import re, string, nltk
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
# English stopwords consulted by remove_stopwords(). The stopwords corpus is a
# separate NLTK download, so on an environment where it is missing the lookup
# raises LookupError; fetch it once and retry instead of failing the notebook.
try:
    stopword_list = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopword_list = nltk.corpus.stopwords.words('english')

def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``TweetTokenizer``.

    The tokenizer is configured with ``preserve_case=False``,
    ``strip_handles=True`` and ``reduce_len=True``.

    Returns the list of tokens produced by the tokenizer.
    """
    return TweetTokenizer(
        preserve_case=False,
        strip_handles=True,
        reduce_len=True,
    ).tokenize(text)

def remove_stopwords(text):
    """Return ``text`` with every token found in ``stopword_list`` removed.

    The text is tokenized with ``tokenize``; the surviving tokens are joined
    back together with single spaces.
    """
    kept = (tok for tok in tokenize(text) if tok not in stopword_list)
    return ' '.join(kept)

def normalize_corpus(corpus):
    """Normalize every document in ``corpus``.

    Each document is lower-cased and then passed through ``remove_stopwords``.

    Returns a new list of normalized strings, in the same order as ``corpus``.
    """
    return [remove_stopwords(document.lower()) for document in corpus]

In [6]:
# Normalization: lower-case the training tweets and drop stopwords.
norm_train = normalize_corpus(train_x)
# Feature extraction: unigram + bigram counts over the tokens from tokenize().
# token_pattern=None because a custom tokenizer is supplied; leaving the default
# pattern in place makes scikit-learn warn that the pattern is being ignored.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    token_pattern=None,
)
train_features = vectorizer.fit_transform(norm_train).astype(float)

In [7]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Creating the SGDClassifier model.
# SGD shuffles the training data on every epoch, so without a fixed seed the
# selected hyperparameters and reported scores change from run to run.
# The seed matches the one used for the train/test split.
svm = SGDClassifier(random_state=1000)

# Defining the hyperparameter grid to search over (3 x 3 = 9 candidates)
param_grid = {
    'alpha': [0.001, 0.01, 0.1],
    'max_iter': [10, 100, 1000],
}

# StratifiedKFold object: every fold keeps the class proportions of train_y.
# NOTE: 100 folds x 9 candidates = 900 fits, which dominates the notebook's runtime.
skf = StratifiedKFold(n_splits=100)

# Create the GridSearchCV object with StratifiedKFold.
# n_jobs=-1 spreads the independent fits over all available cores; it does not
# change the results.
grid_search = GridSearchCV(svm, param_grid, cv=skf, n_jobs=-1)

# Fitting the GridSearchCV object to the training data
grid_search.fit(train_features, train_y)

# Printing the best hyperparameters and the mean cross-validated accuracy
print("Best hyperparameters:", grid_search.best_params_)
print("Best accuracy score:", grid_search.best_score_)




Best hyperparameters: {'alpha': 0.001, 'max_iter': 1000}
Best accuracy score: 0.8215204336947457


In [8]:
# Apply the same normalization to the held-out tweets, then encode them with
# the vocabulary learned on the training set (transform only, never fit).
norm_test = normalize_corpus(test_x)
# Cast to float so the test matrix has the same dtype as train_features.
test_features = vectorizer.transform(norm_test).astype(float)
# Accuracy of the tuned model on the held-out tweets.
grid_search.score(test_features, test_y)

0.8121567191504944

In [9]:
# Predict a sentiment label for every held-out tweet with the fitted grid search.
predicted_sentiments = grid_search.predict(test_features)

In [10]:
# Printing the evaluation measures report: per-class precision, recall and F1
# on the held-out tweets, listed in the class order given below.
class_order = ['positive', 'neutral', 'negative']
report = metrics.classification_report(
    test_y,
    predicted_sentiments,
    labels=class_order,
)
print(report)

              precision    recall  f1-score   support

    positive       0.79      0.72      0.76       445
     neutral       0.67      0.53      0.59       535
    negative       0.85      0.92      0.88      1751

    accuracy                           0.81      2731
   macro avg       0.77      0.72      0.74      2731
weighted avg       0.80      0.81      0.81      2731



In [11]:
import pickle

# Saving the model: the fitted vectorizer and the tuned grid search are stored
# together as one tuple so both are restored by a single pickle.load.
# NOTE: the vectorizer was built with tokenizer=tokenize, and pickle stores
# functions by reference, so tokenize() must be defined/importable wherever
# this file is loaded.
with open('model.pickle', 'wb') as model_file:
    pickle.dump((vectorizer, grid_search), model_file)