<a href="https://colab.research.google.com/github/GusMalija/Master-Thesis-Project-Augustine-Malija/blob/main/experimentation_baseline_climate_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#calling important libraries
import pandas as pd #for data manipulation
import numpy as np #for data manipulation
import matplotlib as plt #for plotting
import os #for ease of python system interaction
import sys
import re
import nltk
nltk.download("stopwords")
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#retrieving the data
url = "https://raw.githubusercontent.com/GusMalija/Master-Thesis-Project-Augustine-Malija/main/Data/twitter_sentiment_data.csv"

# number of rows to draw for the experiment and the seed that makes the draw repeatable
SAMPLE_SIZE = 20000
RANDOM_SEED = 42

labeled_data = pd.read_csv(url)
#checking the dataset features
# printed explicitly: a bare expression in the middle of a cell is evaluated but never displayed
print(labeled_data.keys())
print(labeled_data.head())

#randomly selecting SAMPLE_SIZE (20000) data points
# random_state pins the subset so the reported metrics can be reproduced after a kernel restart
labeled_data = labeled_data.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

In [None]:
#pulling labels and tweets out of the frame by column position
# column 0 -> stance labels, column 1 -> tweet text
# NOTE(review): positions are taken from the original code; confirm against the CSV header
stance_labels, features = (
    labeled_data.iloc[:, position].values for position in (0, 1)
)

In [None]:
#preprocessing tweets
def preprocess_tweet(text):
    """Normalise one raw tweet into lowercase, whitespace-separated words.

    The removals are ordered so that every pattern still sees the characters
    it depends on: URLs, @mentions and #hashtags are stripped while their
    marker characters (':', '/', '.', '@', '#') are still present, and only
    afterwards are the remaining non-word characters blanked out. Whitespace
    is collapsed last so that no earlier removal leaves stray gaps.

    Parameters
    ----------
    text : object
        Raw tweet; it is passed through ``str()`` first, so non-string
        values (e.g. NaN) are tolerated.

    Returns
    -------
    str
        The cleaned tweet, without leading or trailing whitespace.
    """
    #Converting to Lowercase (done first so the URL pattern also catches "HTTP...")
    cleaned = str(text).lower()
    #removing url - must run before punctuation is stripped, otherwise only "http(s)" is removed
    cleaned = re.sub(r"http\S+", " ", cleaned)
    #removing @mentions - needs the '@' to still be there
    cleaned = re.sub(r'@\w+', ' ', cleaned)
    #Removing hashtags - needs the '#' to still be there
    cleaned = re.sub(r'#\w*', ' ', cleaned)
    #Removing special characters
    cleaned = re.sub(r'\W', ' ', cleaned)
    #removing numbers
    cleaned = re.sub(r'[0-9]+', '', cleaned)
    #removing single characters surrounded by whitespace
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)
    #Removing a single character at the start ('^' is an anchor here, not a literal caret);
    #this also covers a leading 'b' left over from a bytes repr
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)
    #Substituting multiple spaces with single space (last, so earlier removals leave no gaps)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()


# one cleaned string per tweet, in the same order as `features`
processed_features = [preprocess_tweet(tweet) for tweet in features]

In [None]:
#specifying parameters
# TF-IDF settings shared by every candidate in the grid
_vectorizer_grid = {
    'vect__max_df': (0.9,),          # drop terms whose document frequency exceeds 0.9
    'vect__min_df': (2,),            # drop terms whose document frequency is below 2
    'vect__ngram_range': ((1, 2),),  # unigrams and bigrams
}

# candidates using the Gaussian (RBF) kernel, with and without class re-weighting
_rbf_grid = dict(_vectorizer_grid)
_rbf_grid['clf__estimator__kernel'] = ['rbf']
_rbf_grid['clf__estimator__gamma'] = [1e0,]   # 1e0 is a gamma of one
_rbf_grid['clf__estimator__C'] = [1,]
_rbf_grid['clf__estimator__class_weight'] = [None, "balanced"]

# single candidate using the linear kernel
_linear_grid = dict(_vectorizer_grid)
_linear_grid['clf__estimator__kernel'] = ['linear']
_linear_grid['clf__estimator__C'] = [1,]

# list of grids: each dict is expanded independently by the grid search
parameters = [_rbf_grid, _linear_grid]

In [None]:
# building a pipeline: TF-IDF vectoriser feeding a one-vs-rest SVM
# step names ('vect', 'clf') are the prefixes used by the keys of the parameter grid
# vectoriser options left disabled by the author: max_features = 2500, stop_words=stopwords.words("english")
tfidf_step = ('vect', TfidfVectorizer())
# NOTE(review): probability=True is kept as in the original; nothing visible here calls predict_proba - confirm it is needed
svm_step = ('clf', OneVsRestClassifier(SVC(probability=True)))

pipeline = Pipeline([tfidf_step, svm_step])

In [None]:
#splitting the dataset into a train and a test set
#80 percent of the data for training, 20 percent for testing
from sklearn.model_selection import train_test_split

# fixed random_state keeps the partition identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    processed_features,
    stance_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
#grid search over the candidate settings in `parameters`
# n_jobs=-1 is the parallelism setting; verbose=1 prints fit progress
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
)

In [None]:
#fitting the classifier on the training split
grid_search.fit(X_train, y_train)
# the fitted search object is what gets used for prediction
classifier = grid_search

#predicting labels for the held-out test split
y_predict = classifier.predict(X_test)

# accuracy (in percent) noted by the author for different sample sizes:
#   2000 -> 64
#   5000 -> 65
#   6000 -> 67
#  10000 -> 70
#  15000 -> 72
#  20000 -> 73

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 181.5min finished


In [None]:
#confusion matrix followed by the per-class precision/recall/F1 report
from sklearn.metrics import classification_report, confusion_matrix

# both metrics take (true labels, predicted labels); print them in turn
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_predict))

[[ 136   84  122   26]
 [  24  318  276   63]
 [  22  143 1788  169]
 [   6   15  149  659]]
              precision    recall  f1-score   support

          -1       0.72      0.37      0.49       368
           0       0.57      0.47      0.51       681
           1       0.77      0.84      0.80      2122
           2       0.72      0.79      0.75       829

    accuracy                           0.73      4000
   macro avg       0.69      0.62      0.64      4000
weighted avg       0.72      0.73      0.71      4000

