### Import necessary requirements

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import re

### Load the labelled data

In [2]:
df = pd.read_csv('labelled_tinubu.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sentiment,score,location,hashtags
0,0,PHOTOS: Director of the Fundraising Committee ...,neutral,0.927625,Bourdillon,BAT2023
1,1,Even Peter Obi no fit do shingbaine as preside...,neutral,0.532245,"Lagos, Nigeria",BAT2023 RenewedHope2023 GETYOURPVC
2,2,day 8. IBEJU LEKKI LG: DATE: WEDNESDAY (14/12/...,neutral,0.894279,"Lagos, Nigeria",BAT2023 Asiwaju agbadoeconomy
3,3,@user May d trip be blessed &amp; stars aligne...,positive,0.939372,,BAT2023
4,4,May d trip be blessed &amp; stars aligned towa...,positive,0.94718,,BAT2023


### Convert sentiment column to numerics

In [4]:
sentiment_names = {
                    'negative': 0,
                    'neutral': 1,
                    'positive': 2
}

df['sentiment'] = df['sentiment'].map(sentiment_names)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sentiment,score,location,hashtags
0,0,PHOTOS: Director of the Fundraising Committee ...,1,0.927625,Bourdillon,BAT2023
1,1,Even Peter Obi no fit do shingbaine as preside...,1,0.532245,"Lagos, Nigeria",BAT2023 RenewedHope2023 GETYOURPVC
2,2,day 8. IBEJU LEKKI LG: DATE: WEDNESDAY (14/12/...,1,0.894279,"Lagos, Nigeria",BAT2023 Asiwaju agbadoeconomy
3,3,@user May d trip be blessed &amp; stars aligne...,2,0.939372,,BAT2023
4,4,May d trip be blessed &amp; stars aligned towa...,2,0.94718,,BAT2023


### Select the necessary columns for model development

In [6]:
df = df[['sentiment', 'tweet']]

In [7]:
df.head()

Unnamed: 0,sentiment,tweet
0,1,PHOTOS: Director of the Fundraising Committee ...
1,1,Even Peter Obi no fit do shingbaine as preside...
2,1,day 8. IBEJU LEKKI LG: DATE: WEDNESDAY (14/12/...
3,2,@user May d trip be blessed &amp; stars aligne...
4,2,May d trip be blessed &amp; stars aligned towa...


### Data cleaning / preprocessing

In [8]:
def remove_user(text):
    text = re.sub(r'@[A-Za-z0-9^\w]+', '', str(text))
    return text

def remove_link(text):
    text = re.sub(r'http', '', str(text))
    return text    

In [9]:
df['tweet'] = df['tweet'].apply(remove_user)
df['tweet'] = df['tweet'].apply(remove_link)

In [10]:
[i for i in df.tweet.head()]

['PHOTOS: Director of the Fundraising Committee of the PCC, and Vice Presidential Candidate, Sen. Kashim Shettima, the Lagos State Governor, Babajide Sanwo-Olu unveil the Crowdfunding App for the APC Presidential Campaign 2023 at the Civic Centre, Victoria Island, Lagos.  ',
 'Even Peter Obi no fit do shingbaine as president ask me why, him first 4yrs him go repeat the process of trying to stabilize... As Buhari don Stabilized make  start work  ',
 'day 8. IBEJU LEKKI LG: DATE: WEDNESDAY (14/12/2022) VENUE: SUPPLIER HALL, ELEKO JUNCTION, OPPOSITE TOTAL PETROL STATION, IBEJU LEKKI, LAGOS.   ',
 ' May d trip be blessed &amp; stars aligned towards d resounding victory from Feb 25, 2023, for d -  presidency project, on d  SPV. D Almighty willing &amp; sparing us all, d victory shall usher into 🇳🇬 an era of uncommon economic growth &amp; peace.',
 'May d trip be blessed &amp; stars aligned towards d resounding victory from Feb 25, 2023, for d -  presidency project, on d  SPV. D Almighty wil

In [11]:
word_tokens = [word_tokenize(tweet) for tweet in df.tweet]
cleaned_tokens = [[word for word in tweet if word.isalpha()] for tweet in word_tokens]
stemmed_tokens = [[PorterStemmer().stem(word) for word in tweet] for tweet in cleaned_tokens]

In [12]:
len_tokens = []

for i in range(len(stemmed_tokens)):
    len_tokens.append(len(stemmed_tokens[i]))
    
df['n_tokens'] = len_tokens

In [13]:
count = 0
for i in stemmed_tokens:
    df['tweet'][count] = ' '.join(stemmed_tokens[count])
    count += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet'][count] = ' '.join(stemmed_tokens[count])


In [14]:
[i for i in df.tweet.head()]

['photo director of the fundrais committe of the pcc and vice presidenti candid kashim shettima the lago state governor babajid unveil the crowdfund app for the apc presidenti campaign at the civic centr victoria island lago',
 'even peter obi no fit do shingbain as presid ask me whi him first him go repeat the process of tri to stabil as buhari don stabil make start work',
 'day ibeju lekki lg date wednesday venu supplier hall eleko junction opposit total petrol station ibeju lekki lago',
 'may d trip be bless amp star align toward d resound victori from feb for d presid project on d spv d almighti will amp spare us all d victori shall usher into an era of uncommon econom growth amp peac',
 'may d trip be bless amp star align toward d resound victori from feb for d presid project on d spv d almighti will amp spare us all d victori shall usher into an era of uncommon econom growth amp peac']

In [15]:
df.head()

Unnamed: 0,sentiment,tweet,n_tokens
0,1,photo director of the fundrais committe of the...,36
1,1,even peter obi no fit do shingbain as presid a...,30
2,1,day ibeju lekki lg date wednesday venu supplie...,18
3,2,may d trip be bless amp star align toward d re...,41
4,2,may d trip be bless amp star align toward d re...,41


### Create vectorizer for tweet column

In [26]:
stop_words = list(ENGLISH_STOP_WORDS)

In [27]:
vect = CountVectorizer(max_features=1000, ngram_range=(1,2), max_df=500, stop_words=stop_words)
vect.fit(df.tweet)
X = vect.transform(df.tweet)

In [28]:
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())

In [29]:
X_df

Unnamed: 0,abdullahi,abeg,abi,abik,abl,abraham,abubakar,abuja,abuja today,abus,...,ye,year,yesterday,yin,yoruba,young,youth,youth walk,zero,zlatan
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Columns: 1000 entries, abdullahi to zlatan
dtypes: int64(1000)
memory usage: 76.3 MB


### Append sentiment column to vectorized dataframe

In [31]:
X_df['sentiments'] = df['sentiment']

In [32]:
X_df

Unnamed: 0,abdullahi,abeg,abi,abik,abl,abraham,abubakar,abuja,abuja today,abus,...,year,yesterday,yin,yoruba,young,youth,youth walk,zero,zlatan,sentiments
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Split dataset into dependent and independent variables for modelling

In [33]:
X = X_df.drop('sentiments', axis=1)
y = X_df.sentiments

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [35]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8000, 1000)
(2001, 1000)
(8000,)
(2001,)


In [36]:
log_reg = LogisticRegression(max_iter=3000).fit(X_train, y_train)

In [37]:
log_reg.score(X_train, y_train)

0.775875

In [38]:
log_reg.score(X_test, y_test)

0.6846576711644178

---

### saving model

In [39]:
import pickle

In [40]:
with open('log_reg_tinubu.pkl', 'wb') as file1:
    pickle.dump(log_reg, file=file1)
    
with open('vectorizer_tinubu.pkl', 'wb') as file2:
    pickle.dump(vect, file=file2)

In [42]:
a = {'a': 1, 'b': 2}
pd.DataFrame(a, columns=a.keys)

TypeError: 'builtin_function_or_method' object is not iterable