<h1> Importing pandas and other basic libraries</h1>

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk

<h1>Get the dataset (Safaricom hashtag tweets up to 31st March)</h1>

In [None]:
file = pd.read_csv("../input/hashtag-safaricom-tweets/safaricom_tweets.csv")
file.head()

<h1>Data overview</h1>

In [None]:
file.columns

<h1>Rows and Columns count</h1>

In [None]:
file.shape

In [None]:
# Keep only the tweet id, author and text columns. The explicit .copy() makes
# tweets_df an independent DataFrame, so the columns added to it in later cells
# are written to this frame rather than to a slice of `file`
# (which triggers pandas' SettingWithCopyWarning).
tweets_df = file[["Tweet Id", "Screen Name", "Text"]].copy()
tweets_df.head()

In [None]:
tweets_df.shape

<h1>Data cleaning and preprocessing</h1>
<ul>
<li>Tokenization</li>
<li>Stemming (Porter stemmer)</li>
<li>Remove punctuation tags</li>
<li>Remove emojis</li>
<li>Strip numerical values</li>
<li>Remove stop-words</li>
</ul>

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('punkt')

In [None]:
!pip install emoji

In [None]:
import emoji
def tokenize_tweets(text):
  """Clean a raw tweet and split it into word tokens.

  Emojis, URLs, punctuation and digits are stripped (in that order) and the
  remaining text is passed to NLTK's ``word_tokenize``.

  Args:
    text: Raw tweet text (str).

  Returns:
    list[str]: Tokens of the cleaned tweet.
  """
  # Remove emojis outright. demojize() would instead rewrite each emoji as a
  # ":name:" string whose letters survive the punctuation strip below and end
  # up as spurious tokens (e.g. "face_with_tears_of_joy").
  text = emoji.replace_emoji(text, replace='')
  # remove urls (raw string so that \S is a regex class, not a string escape)
  text = re.sub(r'http[s]?://\S+', '', text)
  # remove punctuation: every character that is neither a word char nor whitespace
  text = re.sub(r'[^\w\s]', '', text)
  # strip numbers
  text = re.sub(r'[0-9]+', '', text)

  return word_tokenize(text)
# Tokenise every raw tweet.
tweets_df["Tweets"] = tweets_df["Text"].apply(lambda x: tokenize_tweets(x))
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words("english")
# NLTK's English stop-word list is lower-case; a set gives constant-time lookups.
stop_set = set(stop)
# Compare on the lower-cased token so capitalised stop-words ("The", "And")
# are recognised as well; previously they slipped through the filter because
# the membership test ran before lower-casing.
tweets_df["stop_words"] = tweets_df["Tweets"].apply(lambda x: [w for w in x if w.lower() in stop_set])
tweets_df["Tweets"] = tweets_df["Tweets"].apply(lambda x: [w.lower() for w in x if w.lower() not in stop_set])

tweets_df.head(10)

In [None]:
tweets_df.head()

In [None]:
string.punctuation

In [None]:
# Import only the class that is used; a star import would pull every public
# name of nltk.stem.porter into the notebook namespace.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# Replace each token by its Porter stem. This overwrites the "Tweets" column,
# so re-running the cell stems the already-stemmed tokens again.
tweets_df["Tweets"] = tweets_df["Tweets"].apply(lambda x: [stemmer.stem(w) for w in x])
tweets_df.head()

In [None]:

def remove_punct(text):
  """Join tokens into one space-separated string without punctuation or digits.

  Args:
    text: Iterable of strings (the notebook passes a list of tokens).

  Returns:
    str: The kept items joined by single spaces, with every run of digits
    removed afterwards.
  """
  kept = []
  for token in text:
    # An item is dropped when it occurs as a substring of string.punctuation
    # (this includes the empty string).
    if token in string.punctuation:
      continue
    kept.append(token)
  joined = " ".join(kept)
  return re.sub('[0-9]+', '', joined)
# Turn each token list into a single cleaned string.
tweets_df['tweet_punct'] = tweets_df['Tweets'].apply(remove_punct)


In [None]:
tweets_df.head(10)

In [None]:

tweets_df.head()

<h1>Data visualization (word cloud)</h1>

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# One long string holding every cleaned tweet; later cells reuse `all_words`.
all_words = ' '.join(tweets_df['tweet_punct'])
wordcloud = WordCloud(
    width=800, height=500, random_state=21, max_font_size=110
).generate(all_words)

# Draw the cloud on an explicit figure/axes pair and hide the axis frame.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()


<h1>Get most frequent words</h1>

In [None]:
from collections import Counter

# Tally every whitespace-separated word across all cleaned tweets.
cnt = Counter(
    word
    for text in tweets_df["tweet_punct"].values
    for word in text.split()
)

cnt.most_common(10)

In [None]:
!pip install vaderSentiment

<h1>Import sklearn</h1>

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

<h1>Getting sentiments label</h1>

In [None]:
def _polarity_component(sentence, key):
    """Return the ``key`` entry of the analyzer's polarity scores for ``sentence``."""
    return analyzer.polarity_scores(sentence)[key]


def sentiment_score_compound(sentence):
    """Return the 'compound' polarity score of ``sentence``."""
    return _polarity_component(sentence, 'compound')


def sentiment_score_pos(sentence):
    """Return the 'pos' polarity score of ``sentence``."""
    return _polarity_component(sentence, 'pos')


def sentiment_score_neg(sentence):
    """Return the 'neg' polarity score of ``sentence``."""
    return _polarity_component(sentence, 'neg')


def sentiment_score_neu(sentence):
    """Return the 'neu' polarity score of ``sentence``."""
    return _polarity_component(sentence, 'neu')
# Score the raw tweet text instead of the cleaned `tweet_punct` column.
# VADER is a lexicon/rule-based analyser: it looks up full word forms and uses
# negations ("not", "no"), capitalisation, punctuation and emojis as cues.
# `tweet_punct` has been stemmed, lower-cased and stripped of stop-words and
# punctuation, so most of those cues (and many lexicon matches) are gone.
tweets_df["tweets_sent_compound"] = tweets_df["Text"].apply(sentiment_score_compound)
tweets_df["tweets_sent_pos"] = tweets_df["Text"].apply(sentiment_score_pos)
tweets_df["tweets_sent_neg"] = tweets_df["Text"].apply(sentiment_score_neg)
tweets_df.head()

In [None]:
tweets_df.tail()

In [None]:
# `all_words` is one long string; iterating a string yields characters, so it
# is split on whitespace first to make FreqDist count words.
wordlist = nltk.FreqDist(all_words.split())
word_features = wordlist.keys()

<h1>Vectorization</h1>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# Plain Python list of the cleaned tweet strings; the vectorizers in later
# cells are fitted on this list.
tweets_list = list(tweets_df["tweet_punct"])
len(tweets_list)


In [None]:
# Cleaned tweet strings as a pandas Series.
# NOTE(review): X is reassigned in the CountVectorizer cell further down
# before any model is trained on it.
X = tweets_df["tweet_punct"]

# TF-IDF vectorizer over unigrams and bigrams with sublinear term-frequency
# scaling, ignoring terms seen in fewer than 5 documents or in more than 95%
# of them. It is only constructed here, not fitted.
# NOTE(review): `vec` is not referenced again in the visible notebook.
vec = TfidfVectorizer(min_df=5, max_df=0.95, sublinear_tf = True,use_idf = True,ngram_range=(1, 2))

In [None]:
len(all_words)

<p>Label encoding: a negative tweet is labelled 0, a neutral tweet 1 and a positive tweet 2.</p>
<p>Integer codes are used so that the labels can be passed to the classifiers as numeric target classes.</p>

In [None]:
def label_value(val):
  """Map a compound sentiment score to an integer class label.

  Args:
    val: Numeric sentiment score.

  Returns:
    int: 0 when ``val`` is below zero, 1 when it equals zero, 2 otherwise.
  """
  if val < 0:
    return 0
  return 1 if val == 0 else 2
# Derive the integer class label from each tweet's compound sentiment score.
tweets_df["label"] = tweets_df["tweets_sent_compound"].apply(label_value)
tweets_df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words: each feature records whether a word occurs in a tweet.
cv = CountVectorizer(binary=True)
X = cv.fit_transform(tweets_list)
# Integer sentiment labels as a NumPy array.
y = tweets_df["label"].values


<h1>Classification model</h1>

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for validation and train on the remaining 80%.
# The previous `train_size=0.2` did the opposite: it fitted on one fifth of
# the data and validated on the other four fifths.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size = 0.2, random_state = 0
)


<h1>Logistic Regression</h1>

In [None]:
# Fit a logistic-regression classifier (library default settings) on the
# training split produced by the preceding train_test_split call.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
# Predict labels for the validation split, then report overall accuracy,
# the per-class classification report and the confusion matrix.
pred = lr.predict(X_val)
print(accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))

<h1>tf-idf vectorization</h1>

In [None]:
# TF-IDF weighted bag-of-words features over the cleaned tweets.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(tweets_list)
y = tweets_df["label"].values

# Hold out 20% of the tweets for validation and train on the remaining 80%.
# The previous `train_size=0.2` fitted on one fifth of the data only.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size = 0.2, random_state = 0
)

In [None]:
# Fit a fresh logistic-regression classifier (library default settings) on the
# current training split; this rebinds `lr`.
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
# Predict labels for the validation split, then report overall accuracy,
# the per-class classification report and the confusion matrix.
pred = lr.predict(X_val)
print(accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))

<h1>Support Vector Machine</h1>

In [None]:
from sklearn.svm import LinearSVC
# Binary presence features over word 1-grams to 3-grams.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3))
X = ngram_vectorizer.fit_transform(tweets_list)
y = tweets_df["label"].values
# Hold out 20% of the tweets for validation and train on the remaining 80%.
# The previous `train_size=0.2` fitted on one fifth of the data only.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size = 0.2, random_state = 0
)
# Linear support-vector classifier with library default settings.
svm = LinearSVC()
svm.fit(X_train, y_train)

In [None]:
# Predict labels for the validation split with the fitted SVM, then report
# overall accuracy, the per-class classification report and the confusion matrix.
pred = svm.predict(X_val)
print(accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))

<h1>Naïve Bayes classifier</h1>

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with library default settings.
# NOTE(review): X_train / X_val are whatever the most recent split produced;
# in the visible notebook that is the binary 1-3-gram count matrix built in
# the SVM cell, not the TF-IDF features.
MNB = MultinomialNB()
MNB.fit(X_train, y_train)
# Evaluate on the validation split: accuracy, per-class report, confusion matrix.
pred = MNB.predict(X_val)
print(accuracy_score(y_val, pred))
print(classification_report(y_val, pred))
print(confusion_matrix(y_val, pred))