<a href="https://colab.research.google.com/github/GusMalija/Master-Thesis-Project-Augustine-Malija/blob/main/Other_tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#calling important libraries
import pandas as pd #for data manipulation
import numpy as np #for data manipulation
import matplotlib as plt #for plotting
import os #for ease of python system interaction
import sys
import re
import nltk
nltk.download("stopwords")
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#retrieving the labelled tweets from the project repository
url = "https://raw.githubusercontent.com/GusMalija/Master-Thesis-Project-Augustine-Malija/main/Data/twitter_sentiment_data.csv"

labeled_data = pd.read_csv(url)
#previewing the first rows; the rendered table shows the column names as well
labeled_data.head()

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [8]:
#extracting only the tweet text as features; columns are selected by name
#rather than by position so the result does not depend on the column order
features = labeled_data["message"].values
#extracting the sentiment classes as labels
all_labels = labeled_data["sentiment"].values

In [9]:
#preprocessing tweets
def preprocess_tweet(text):
    """Normalise one tweet into lowercase, space-separated text.

    The value is converted with str() first, so non-string entries are
    accepted. Returns the cleaned string.
    """
    #Replacing every non-word character (punctuation, symbols) with a space
    cleaned = re.sub(r'\W', ' ', str(text))
    #Removing single letters that are surrounded by whitespace
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)
    #Removing a single letter at the start of the text; '^' must stay
    #unescaped so that it anchors to the start instead of matching a literal caret
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)
    #Substituting multiple spaces with single space
    cleaned = re.sub(r'\s+', ' ', cleaned)
    #Removing prefixed 'b' (no-op whenever the start-of-text rule above already consumed it)
    cleaned = re.sub(r'^b\s+', '', cleaned)
    #Converting to Lowercase
    return cleaned.lower()


processed_features = [preprocess_tweet(tweet) for tweet in features]

In [10]:
#specifying the hyper-parameter grid explored by the grid search
#TF-IDF settings shared by both kernel configurations
_vectorizer_grid = {
    'vect__max_df': (0.9,),  #upper document-frequency cut-off for terms
    'vect__min_df': (2,),  #lower document-frequency cut-off for terms
    'vect__ngram_range': ((1, 1),),  #only unigrams
}

#gaussian (RBF) kernel, tried with and without balanced class weights
_rbf_grid = {
    **_vectorizer_grid,
    'clf__estimator__kernel': ['rbf'],
    'clf__estimator__gamma': [1e0],  #a gamma of 1.0
    'clf__estimator__C': [1],
    'clf__estimator__class_weight': [None, "balanced"],
}

#linear kernel
_linear_grid = {
    **_vectorizer_grid,
    'clf__estimator__kernel': ['linear'],
    'clf__estimator__C': [1],
}

parameters = [_rbf_grid, _linear_grid]

In [11]:
# building a pipeline: TF-IDF features feeding a one-vs-rest SVM
# the vectorizer is created with its defaults; the vect__* values to try are
# set by the grid search, so the parameter grid is not a constructor argument
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(SVC(probability=True))),  # probability=True enables predict_proba
])

In [12]:
#splitting the dataset into a training set and a test set
#(20 percent is held out for testing, the remaining 80 percent is used for training)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    processed_features,
    all_labels,
    test_size=0.2,
    random_state=42,  #fixed seed so the split is reproducible
)

In [13]:
#grid search over the pipeline and the parameter grid
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,  #run the fits in parallel
    verbose=1,  #report progress while fitting
)

In [None]:
#running the grid search on the training data
classifier = grid_search.fit(X=X_train, y=y_train)

#predicting the labels of the held-out test tweets
y_predict = classifier.predict(X=X_test)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
#confusion matrix followed by the classification report for the test set
from sklearn.metrics import classification_report, confusion_matrix

for build_report in (confusion_matrix, classification_report):
    print(build_report(y_test, y_predict))

In [None]:
#evaluating the model with the one-vs-rest ROC AUC score on the test set
from sklearn.metrics import roc_auc_score

test_probabilities = classifier.predict_proba(X_test)
roc_auc_score(y_test, test_probabilities, multi_class="ovr")

#a score above 0.5 signifies that our model is useful

In [None]:
#evaluating the model using 5-fold cross validation on the training data
from sklearn.model_selection import cross_val_score

#NOTE(review): `classifier` is the object returned by grid_search.fit, so
#presumably every fold re-runs the whole grid search (nested cross validation),
#which is slow - confirm this is intended rather than scoring only the selected pipeline
all_accuracies = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    cv=5,
)

#returning the score of each fold
print(all_accuracies)

In [None]:
#printing the standard deviation of the fold accuracies
print(np.std(all_accuracies))

#a low spread is a good indicator that the model will perform similarly on all test sets and that the result was not obtained by chance