### Import necessary requirements

In [19]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import re

### Load the labelled data

In [4]:
df = pd.read_csv('labelled_obi.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sentiment,score,location,hashtags
0,0,@user @user @user @user @user ....he invested ...,neutral,0.549163,"Lagos, Nigeria",
1,1,@user @user @user,neutral,0.610686,,
2,2,@user God bless you for taking time to spell t...,positive,0.89305,"London, England",peterisbetter PeterObi4President2023 ObiDatti2023
3,3,@user @user AMEN. Happy birthday sir. Alhaji A...,positive,0.973588,,
4,4,@user @user @user Nwoke okpontu...,neutral,0.801305,ana igbo,


### Convert sentiment column to numerics

In [6]:
sentiment_names = {
                    'negative': 0,
                    'neutral': 1,
                    'positive': 2
}

df['sentiment'] = df['sentiment'].map(sentiment_names)

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sentiment,score,location,hashtags
0,0,@user @user @user @user @user ....he invested ...,1,0.549163,"Lagos, Nigeria",
1,1,@user @user @user,1,0.610686,,
2,2,@user God bless you for taking time to spell t...,2,0.89305,"London, England",peterisbetter PeterObi4President2023 ObiDatti2023
3,3,@user @user AMEN. Happy birthday sir. Alhaji A...,2,0.973588,,
4,4,@user @user @user Nwoke okpontu...,1,0.801305,ana igbo,


### Select the necessary columns for model development

In [8]:
df = df[['sentiment', 'tweet']]

In [9]:
df.head()

Unnamed: 0,sentiment,tweet
0,1,@user @user @user @user @user ....he invested ...
1,1,@user @user @user
2,2,@user God bless you for taking time to spell t...
3,2,@user @user AMEN. Happy birthday sir. Alhaji A...
4,1,@user @user @user Nwoke okpontu...


### Data cleaning / preprocessing

In [10]:
def remove_user(text):
    text = re.sub(r'@[A-Za-z0-9^\w]+', '', str(text))
    return text

def remove_link(text):
    text = re.sub(r'http', '', str(text))
    return text    

In [11]:
df['tweet'] = df['tweet'].apply(remove_user)
df['tweet'] = df['tweet'].apply(remove_link)

In [12]:
[i for i in df.tweet.head()]

["     ....he invested so much in these sectors.... I've been in Lagos for 29yrs, why is the mainland still as it was then and now? With little or no development but massive development has been going on the island? I await your response.... Like I said, I want to engage you.",
 '  ',
 ' God bless you for taking time to spell these out.   ',
 '  AMEN. Happy birthday sir. Alhaji Atiku Abubakar.',
 '   Nwoke okpontu...']

In [13]:
word_tokens = [word_tokenize(tweet) for tweet in df.tweet]
cleaned_tokens = [[word for word in tweet if word.isalpha()] for tweet in word_tokens]
stemmed_tokens = [[PorterStemmer().stem(word) for word in tweet] for tweet in cleaned_tokens]

In [14]:
len_tokens = []

for i in range(len(stemmed_tokens)):
    len_tokens.append(len(stemmed_tokens[i]))
    
df['n_tokens'] = len_tokens

In [15]:
count = 0
for i in stemmed_tokens:
    df['tweet'][count] = ' '.join(stemmed_tokens[count])
    count += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet'][count] = ' '.join(stemmed_tokens[count])


In [16]:
[i for i in df.tweet.head()]

['he invest so much in these sector i been in lago for whi is the mainland still as it wa then and now with littl or no develop but massiv develop ha been go on the island i await your respons like i said i want to engag you',
 '',
 'god bless you for take time to spell these out',
 'amen happi birthday sir alhaji atiku abubakar',
 'nwoke okpontu']

In [17]:
df.head()

Unnamed: 0,sentiment,tweet,n_tokens
0,1,he invest so much in these sector i been in la...,49
1,1,,0
2,2,god bless you for take time to spell these out,10
3,2,amen happi birthday sir alhaji atiku abubakar,7
4,1,nwoke okpontu,2


### Create vectorizer for tweet column

In [26]:
stop_words = list(ENGLISH_STOP_WORDS)
stop_words

['because',
 'ie',
 'eleven',
 'well',
 'everything',
 'anywhere',
 'seemed',
 'fifty',
 'around',
 'latterly',
 'serious',
 'therein',
 'however',
 'hereupon',
 'an',
 'ltd',
 'always',
 'five',
 'there',
 'many',
 'some',
 'whole',
 'whose',
 'ever',
 'take',
 'top',
 'before',
 'per',
 'afterwards',
 'above',
 'been',
 'onto',
 'among',
 'former',
 'inc',
 'seem',
 'give',
 'us',
 'whether',
 'myself',
 'neither',
 'also',
 'full',
 'such',
 'becoming',
 'by',
 'so',
 'several',
 'beforehand',
 'hence',
 'enough',
 'became',
 'thence',
 'more',
 'his',
 'wherein',
 'themselves',
 'bill',
 'empty',
 'himself',
 'couldnt',
 'after',
 'do',
 'than',
 'i',
 'rather',
 'done',
 'front',
 'amount',
 'whereupon',
 'about',
 'here',
 'otherwise',
 'nevertheless',
 'is',
 'hasnt',
 'nor',
 'nine',
 'our',
 'how',
 'herein',
 'off',
 'never',
 'others',
 'though',
 'seems',
 'somehow',
 'that',
 'all',
 'sincere',
 'con',
 'co',
 'de',
 'two',
 'out',
 'seeming',
 'why',
 'still',
 'elsewhere

In [27]:
vect = CountVectorizer(max_features=1000, ngram_range=(1,2), max_df=500, stop_words=stop_words)
vect.fit(df.tweet)
X = vect.transform(df.tweet)

In [28]:
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())

In [29]:
X_df

Unnamed: 0,abeg,abi,abl,abov,abuja,abus,accept,account,achiev,act,...,wow,write,wrong,ye,year,yesterday,yoruba,young,youth,youth anambra
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Columns: 1000 entries, abeg to youth anambra
dtypes: int64(1000)
memory usage: 76.3 MB


### Append sentiment column to vectorized dataframe

In [31]:
X_df['sentiment'] = df['sentiment']

In [32]:
X_df

Unnamed: 0,abeg,abi,abl,abov,abuja,abus,accept,account,achiev,act,...,write,wrong,ye,year,yesterday,yoruba,young,youth,youth anambra,sentiment
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Split dataset into dependent and independent variables for modelling

In [33]:
X = X_df.drop('sentiment', axis=1)
y = X_df.sentiment

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [35]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8000, 1000)
(2001, 1000)
(8000,)
(2001,)


In [36]:
log_reg = LogisticRegression(max_iter=3000).fit(X_train, y_train)

In [37]:
log_reg.score(X_train, y_train)

0.8

In [38]:
log_reg.score(X_test, y_test)

0.7146426786606697

---

### saving model

In [39]:
import pickle

In [40]:
with open('log_reg.pkl', 'wb') as file1:
    pickle.dump(log_reg, file=file1)
    
with open('vectorizer.pkl', 'wb') as file2:
    pickle.dump(vect, file=file2)

---

### test model

In [41]:
with open('log_reg.pkl', 'rb') as file1:
    md = pickle.load(file1)
    
with open('vectorizer.pkl', 'rb') as file2:
    vt = pickle.load(file2)

In [42]:
def clean_tweet(text):
    text = re.sub(r'@[A-Za-z0-9^\w]+', '', text)
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = re.sub(r'#(\w+)', '', text)
    text = re.sub(r'[^\w]', ' ', text)
    text = ' '.join(c for c in text.split() if c.isalpha())
    return text

In [43]:
tweet = {'tweet': ['@uchechi', 'obi will make the best president for nigeria', '@nnamdi obi will be the worst president ever #jagaban', 'I am tired of the politics of nigeria abeg']}
tweet = pd.DataFrame(tweet)
tweet['tweet'] = tweet['tweet'].apply(clean_tweet)

tweet

Unnamed: 0,tweet
0,
1,obi will make the best president for nigeria
2,obi will be the worst president ever
3,I am tired of the politics of nigeria abeg


In [44]:
tweet = vt.transform(tweet.tweet)
prediction = md.predict(tweet)

# 'negative': 0,
# 'neutral': 1,
# 'positive': 2



In [45]:
prediction

array([1, 2, 0, 1], dtype=int64)