In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import re

ModuleNotFoundError: No module named 'nltk'

In [2]:
df = pd.read_csv('cleaned_obi.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sentiment,score
0,0,@user @user @user @user @user ....he invested ...,neutral,0.549163
1,1,@user @user @user,neutral,0.610686
2,2,@user God bless you for taking time to spell t...,positive,0.89305
3,3,@user @user AMEN. Happy birthday sir. Alhaji A...,positive,0.973588
4,4,@user @user @user Nwoke okpontu...,neutral,0.801305


In [4]:
sentiment_names = {
                    'negative': 0,
                    'neutral': 1,
                    'positive': 2
}

df['sentiment'] = df['sentiment'].map(sentiment_names)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sentiment,score
0,0,@user @user @user @user @user ....he invested ...,1,0.549163
1,1,@user @user @user,1,0.610686
2,2,@user God bless you for taking time to spell t...,2,0.89305
3,3,@user @user AMEN. Happy birthday sir. Alhaji A...,2,0.973588
4,4,@user @user @user Nwoke okpontu...,1,0.801305


In [6]:
df = df[['sentiment', 'score', 'tweet']]

In [7]:
df.head()

Unnamed: 0,sentiment,score,tweet
0,1,0.549163,@user @user @user @user @user ....he invested ...
1,1,0.610686,@user @user @user
2,2,0.89305,@user God bless you for taking time to spell t...
3,2,0.973588,@user @user AMEN. Happy birthday sir. Alhaji A...
4,1,0.801305,@user @user @user Nwoke okpontu...


In [8]:
def remove_user(text):
    text = re.sub(r'@[A-Za-z0-9^\w]+', '', str(text))
    return text

def remove_link(text):
    text = re.sub(r'http', '', str(text))
    return text    

In [9]:
df['tweet'] = df['tweet'].apply(remove_user)
df['tweet'] = df['tweet'].apply(remove_link)

In [10]:
[i for i in df.tweet.head()]

["     ....he invested so much in these sectors.... I've been in Lagos for 29yrs, why is the mainland still as it was then and now? With little or no development but massive development has been going on the island? I await your response.... Like I said, I want to engage you.",
 '  ',
 ' God bless you for taking time to spell these out.   ',
 '  AMEN. Happy birthday sir. Alhaji Atiku Abubakar.',
 '   Nwoke okpontu...']

In [11]:
word_tokens = [word_tokenize(tweet) for tweet in df.tweet]
cleaned_tokens = [[word for word in tweet if word.isalpha()] for tweet in word_tokens]
stemmed_tokens = [[PorterStemmer().stem(word) for word in tweet] for tweet in cleaned_tokens]

In [12]:
len_tokens = []

for i in range(len(stemmed_tokens)):
    len_tokens.append(len(stemmed_tokens[i]))
    
df['n_tokens'] = len_tokens

In [13]:
count = 0
for i in stemmed_tokens:
    df['tweet'][count] = ' '.join(stemmed_tokens[count])
    count += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet'][count] = ' '.join(stemmed_tokens[count])


In [14]:
[i for i in df.tweet.head()]

['he invest so much in these sector i been in lago for whi is the mainland still as it wa then and now with littl or no develop but massiv develop ha been go on the island i await your respons like i said i want to engag you',
 '',
 'god bless you for take time to spell these out',
 'amen happi birthday sir alhaji atiku abubakar',
 'nwoke okpontu']

In [15]:
df.head()

Unnamed: 0,sentiment,score,tweet,n_tokens
0,1,0.549163,he invest so much in these sector i been in la...,49
1,1,0.610686,,0
2,2,0.89305,god bless you for take time to spell these out,10
3,2,0.973588,amen happi birthday sir alhaji atiku abubakar,7
4,1,0.801305,nwoke okpontu,2


In [16]:
vect = CountVectorizer(max_features=1000, ngram_range=(1,2), max_df=500, stop_words=ENGLISH_STOP_WORDS)
vect.fit(df.tweet)
X = vect.transform(df.tweet)

In [17]:
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())

In [18]:
X_df

Unnamed: 0,abeg,abi,abl,abov,abuja,abus,accept,account,achiev,act,...,wow,write,wrong,ye,year,yesterday,yoruba,young,youth,youth anambra
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Columns: 1000 entries, abeg to youth anambra
dtypes: int64(1000)
memory usage: 76.3 MB


In [20]:
X_df['sentiment'] = df['sentiment']

In [21]:
X_df

Unnamed: 0,abeg,abi,abl,abov,abuja,abus,accept,account,achiev,act,...,write,wrong,ye,year,yesterday,yoruba,young,youth,youth anambra,sentiment
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [22]:
X = X_df.drop('sentiment', axis=1)
y = X_df.sentiment

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [24]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8000, 1000)
(2001, 1000)
(8000,)
(2001,)


In [25]:
log_reg = LogisticRegression(max_iter=3000).fit(X, y)

In [28]:
log_reg.score(X_train, y_train)

0.789375

In [29]:
log_reg.score(X_test, y_test)

0.7926036981509246

---

### saving model

In [33]:
import joblib
import pickle

In [34]:
joblib.dump(log_reg, 'log_reg.joblib')

['log_reg.joblib']

In [35]:
with open('log_reg.pkl', 'wb') as file1:
    pickle.dump(log_reg, file=file1)
    
with open('vectorizer.pkl', 'wb') as file2:
    pickle.dump(vect, file=file2)

---

### test model

In [36]:
with open('log_reg.pkl', 'rb') as file1:
    md = pickle.load(file1)
    
with open('vectorizer.pkl', 'rb') as file2:
    vt = pickle.load(file2)

In [30]:
def clean_tweet(text):
    text = re.sub(r'@[A-Za-z0-9^\w]+', '', text)
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = re.sub(r'#(\w+)', '', text)
    text = re.sub(r'[^\w]', ' ', text)
    text = ' '.join(c for c in text.split() if c.isalpha())
    return text

In [92]:
tweet = {'tweet': ['@uchechi', 'obi will make the best president for nigeria', '@nnamdi obi will be the worst president ever #jagaban', 'I am tired of the politics of nigeria abeg']}
tweet = pd.DataFrame(tweet)
tweet['tweet'] = tweet['tweet'].apply(clean_tweet)

tweet

Unnamed: 0,tweet
0,
1,obi will make the best president for nigeria
2,obi will be the worst president ever
3,I am tired of the politics of nigeria abeg


In [90]:
tweet = vt.transform(tweet.tweet)
prediction = md.predict(tweet)

# 'negative': 0,
# 'neutral': 1,
# 'positive': 2



In [91]:
prediction

array([1, 2, 0, 1], dtype=int64)

In [1]:
import pandas as pd

In [3]:
test = pd.read_csv('C:/Users/USER/Documents/projects/data/twitter_api_data/atiku.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,date,user,source,tweet,location,like_count,retweet_count
0,0,2022-11-29 23:25:12+00:00,BUKOLA00459341,Twitter for Android,@pdpnewgen @PDPAwareness @MobilePunch @PulseNi...,"Maryland, USA",0,0
1,1,2022-11-29 23:07:51+00:00,DrCharlesOdogwu,Twitter for iPhone,#AtikuOkowa2023 #AtikuInOndo,Nigeria,0,0
2,2,2022-11-29 23:04:29+00:00,DavidOkooza,Twitter for iPhone,Borno state PDP is doing well.. We will recove...,"Blue Earth, MN",2,4
3,3,2022-11-29 22:57:15+00:00,MikanoLeader,Twitter for Android,Borno state PDP is doing well.. We will recove...,,59,26
4,4,2022-11-29 22:38:23+00:00,Adamujadas,Twitter for Android,Atiku/Okowa is the only best Candidate we have...,,2,0


In [131]:
df = pd.read_csv('cleaned_obi.csv')

In [144]:
test = df.head(10)

In [133]:
[i for i in test.tweet]

["@user @user @user @user @user ....he invested so much in these sectors.... I've been in Lagos for 29yrs, why is the mainland still as it was then and now? With little or no development but massive development has been going on the island? I await your response.... Like I said, I want to engage you.",
 '@user @user @user',
 '@user God bless you for taking time to spell these out.   ',
 '@user @user AMEN. Happy birthday sir. Alhaji Atiku Abubakar.',
 '@user @user @user Nwoke okpontu...',
 '@user @user @user @user @user ... Lagos Island only. Please bro, I also want to know why the so called health in Lagos is been disregarded by himself. Also, why do we have high rates of deaths in the government hospitals in Lagos? Why are schools in Lagos not topping the charts when it comes to WAEC since...',
 '@user @user @user @user Please kindly mention the k!llings carried out by IPOB that they told you of?',
 '@user @user @user @user @user I just tire for people bro',
 'My deepest appreciation 

In [134]:
def clean_tweet(text):
    text = re.sub(r'@[A-Za-z0-9^\w]+', '', str(text))
    text = re.sub(r'http', '', str(text))
    return text
    

def clean_token(text):
    word_tokens = word_tokenize(text)
    cleaned_tokens = []
    stemmed_tokens = []
    text = ''
#     for tweet in word_tokens:
#         if tweet.isalpha():
#             cleaned_tokens.append(tweet)
#     for tweet in cleaned_tokens:
#         stemmed_tokens.append(PorterStemmer().stem(tweet))
    for tweet in word_tokens:
        text = ' '.join(tweet)
    return text
            
#     word_tokens = [word_tokenize(tweet) for tweet in text]
#     cleaned_tokens = [[word for word in tweet if word.isalpha()] for tweet in word_tokens]
#     stemmed_tokens = [[PorterStemmer().stem(word) for word in tweet] for tweet in cleaned_tokens]
#     for tweet in stemmed_tokens:
#         text = tweet

In [135]:
test.tweet = test['tweet'].apply(clean_tweet)
test.tweet = test['tweet'].apply(clean_token)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.tweet = test['tweet'].apply(clean_tweet)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.tweet = test['tweet'].apply(clean_token)


In [136]:
[i for i in test.tweet]

['.', '', '.', '.', '. . .', '. . .', '?', 'b r o', '.', '.']

In [149]:
[word_tokenize(i) for i in test.tweet]
# word_tokenize((test.tweet[0]))

[['@',
  'user',
  '@',
  'user',
  '@',
  'user',
  '@',
  'user',
  '@',
  'user',
  '....',
  'he',
  'invested',
  'so',
  'much',
  'in',
  'these',
  'sectors',
  '....',
  'I',
  "'ve",
  'been',
  'in',
  'Lagos',
  'for',
  '29yrs',
  ',',
  'why',
  'is',
  'the',
  'mainland',
  'still',
  'as',
  'it',
  'was',
  'then',
  'and',
  'now',
  '?',
  'With',
  'little',
  'or',
  'no',
  'development',
  'but',
  'massive',
  'development',
  'has',
  'been',
  'going',
  'on',
  'the',
  'island',
  '?',
  'I',
  'await',
  'your',
  'response',
  '....',
  'Like',
  'I',
  'said',
  ',',
  'I',
  'want',
  'to',
  'engage',
  'you',
  '.'],
 ['@', 'user', '@', 'user', '@', 'user'],
 ['@',
  'user',
  'God',
  'bless',
  'you',
  'for',
  'taking',
  'time',
  'to',
  'spell',
  'these',
  'out',
  '.'],
 ['@',
  'user',
  '@',
  'user',
  'AMEN',
  '.',
  'Happy',
  'birthday',
  'sir',
  '.',
  'Alhaji',
  'Atiku',
  'Abubakar',
  '.'],
 ['@', 'user', '@', 'user', '@', 'use