In [None]:
import pandas as pd
pd.set_option("display.max_colwidth", 200)

In [None]:
data = pd.read_csv("tweets.csv")
data.head()

# Label 0 refers to positive statements; label 1 refers to negative statements.

In [None]:
# Call the method: a bare `data.info` only displays the bound-method repr,
# not the per-column dtype / non-null summary.
data.info()

In [None]:
#check missing values
data.isnull().sum()

In [None]:
#removing id column
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# 'id' column is already gone.
data = data.drop(columns=['id'], errors='ignore')
data.head()

In [None]:
#checking for class balance
data['label'].value_counts(normalize=True)

In [None]:
#plotting label counts
data['label'].value_counts().plot(kind='pie',)

In [None]:
data['label'].value_counts().plot(kind='bar',)

# DATA CLEANING


In [None]:
data['tweet'][24]

In [None]:
import re

In [None]:
#substituting 's with "is"
re.sub(r"'s\b", "is", data['tweet'][24])

In [None]:
#remove user mentions
data['tweet'][11]

In [None]:
#keeping only letters and digits (every other character becomes a space)
re.sub(r'[^a-zA-Z0-9]', ' ', data['tweet'][11])

In [None]:
#removing the hashtags
data['tweet'][0]
re.sub("#", "", data['tweet'][0])

In [None]:
#removing the links
re.sub(r"http\S+", "", data['tweet'][0])

In [None]:
#removing punctuation
# Demonstrate on tweet 25 (the sample this cell selects) and substitute a
# space rather than the empty string, so neighbouring words are not glued
# together; this is the same substitution the cleaner functions apply.
re.sub(r"[^a-zA-Z]", " ", data['tweet'][25])

In [None]:
#Removal of STOP WORDS
import nltk
# stopwords.words() reads the 'stopwords' corpus; without downloading it the
# call raises LookupError on a fresh environment. 'wordnet' is still fetched
# because WordNetLemmatizer is used further down.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk_stopwords = set(stopwords.words('english'))
print(nltk_stopwords)
len(nltk_stopwords)

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
sklearn_stopwords = set(ENGLISH_STOP_WORDS)
print(sklearn_stopwords)
len(sklearn_stopwords)

In [None]:
#common stopwords from NLTK & skLearn
print(nltk_stopwords.intersection(sklearn_stopwords))

In [None]:
len(nltk_stopwords.intersection(sklearn_stopwords))

In [None]:
#combine stopwords from skLearn & NLTK
combined_stopwords = nltk_stopwords.union(sklearn_stopwords)

In [None]:
#TEXT NORMALIZATION:Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()



In [None]:
data['tweet'][63].split()

In [None]:
# Lower-case and lemmatize every whitespace-separated token of tweet 63;
# each lemma is followed by a single space (so the result ends with one).
new_sent = ''.join(
    lemmatizer.lemmatize(word.lower()) + ' '
    for word in data['tweet'][63].split()
)

new_sent

In [None]:
# pip install contractions
import contractions
data['tweet'][24]

In [None]:
contractions.fix(data['tweet'][24])

#DEFINING THE CLEANER FUNCTION

In [None]:
def tweet_cleaner_without_stopwords(text):
    """Normalize a raw tweet while keeping stop words.

    Steps, in order: rewrite ``'s`` as `` is``, drop ``#`` signs (the hashtag
    word itself is kept), remove ``@mentions``, remove ``http...`` links,
    expand contractions, replace every non-letter with a space, lower-case,
    and lemmatize each whitespace-separated token.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The lemmatized tokens, each followed by a single space (a non-empty
        result therefore ends with a space; input without letters gives '').
    """
    # Note: this also rewrites a possessive 's (e.g. "Apple's" -> "Apple is").
    new_text = re.sub(r"'s\b", " is", text)
    new_text = re.sub("#", "", new_text)
    # Handles may contain underscores. The previous [A-Za-z0-9]+ stopped at
    # the first "_", so "@john_doe" leaked "doe" into the cleaned text.
    new_text = re.sub(r"@[A-Za-z0-9_]+", "", new_text)
    new_text = re.sub(r"http\S+", "", new_text)
    new_text = contractions.fix(new_text)
    new_text = re.sub(r"[^a-zA-Z]", " ", new_text)
    new_text = new_text.lower().strip()

    # join() avoids repeated string concatenation; the trailing space after
    # every lemma is kept so the output format is unchanged.
    return ''.join(lemmatizer.lemmatize(token) + ' ' for token in new_text.split())

In [None]:
# Clean every raw tweet (stop words are kept at this stage).
cleaned_tweets = [tweet_cleaner_without_stopwords(raw) for raw in data['tweet']]

print(cleaned_tweets[:5])

In [None]:
cleaned_tweets[24]
data['tweet'][1500]
cleaned_tweets[1500]

In [None]:
data['cleaned_tweets_w/o_SW'] = cleaned_tweets
data.head()

#DATA VISUALIZATION

In [None]:
#plotting 25 most common words in the dataset

In [None]:
#collecting all words from the dataset
# Flatten the whitespace-separated tokens of every raw tweet into one list.
all_words = [word for tweet in data['tweet'] for word in tweet.split()]

print(all_words[:50])
len(set(all_words))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Frequency Distribution
freq_dist = nltk.FreqDist(all_words)

plt.figure(figsize=(12,5))
plt.title('Top 25 most common words')
plt.xticks(fontsize=15)

freq_dist.plot(25, cumulative=False)

plt.show()

#PLOT FOR CLEANED TWEETS

In [None]:
all_words = []
for t in data['cleaned_tweets_w/o_SW']:
    all_words.extend(t.split())

print(all_words[:50])

In [None]:
len(set(all_words)) # this is the number of unique words in the list

In [None]:
# Frequency Distribution
freq_dist = nltk.FreqDist(all_words)

plt.figure(figsize=(12,5))
plt.title('Top 25 most common words')
plt.xticks(fontsize=15)

freq_dist.plot(25, cumulative=False)

plt.show()

#DATA VISUALIZATION AFTER REMOVING STOP WORDS

In [None]:
type(combined_stopwords)
def tweet_cleaner_with_stopwords(text):
    """Normalize a raw tweet and filter it against ``combined_stopwords``.

    Applies the same normalization as the stop-word-keeping cleaner
    (rewrite ``'s`` as `` is``, drop ``#``, remove ``@mentions`` and
    ``http...`` links, expand contractions, replace non-letters with spaces,
    lower-case), then drops every token that is in ``combined_stopwords`` or
    is shorter than three characters, and lemmatizes what remains.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving lemmatized tokens, each followed by a single space
        ('' when no token survives).
    """
    # Note: this also rewrites a possessive 's (e.g. "Apple's" -> "Apple is").
    new_text = re.sub(r"'s\b", " is", text)
    new_text = re.sub("#", "", new_text)
    # Handles may contain underscores. The previous [A-Za-z0-9]+ stopped at
    # the first "_", so "@john_doe" leaked "doe" into the cleaned text.
    new_text = re.sub(r"@[A-Za-z0-9_]+", "", new_text)
    new_text = re.sub(r"http\S+", "", new_text)
    new_text = contractions.fix(new_text)
    new_text = re.sub(r"[^a-zA-Z]", " ", new_text)
    new_text = new_text.lower().strip()

    # Filtering happens before lemmatization, exactly as before.
    kept_tokens = [
        token
        for token in new_text.split()
        if token not in combined_stopwords and len(token) > 2
    ]

    # The trailing space after every lemma is kept so the output format is
    # unchanged.
    return ''.join(lemmatizer.lemmatize(token) + ' ' for token in kept_tokens)

In [None]:
cleaned_tweets = list(data['tweet'].apply(tweet_cleaner_with_stopwords))
print(cleaned_tweets[:10])

In [None]:
data.columns

In [None]:
data['cleaned_tweets_with_SW'] = cleaned_tweets
data.head()

In [None]:
all_words = []
for t in data['cleaned_tweets_with_SW']:
    all_words.extend(t.split())

print(all_words[:50])

# Frequency Distribution
freq_dist = nltk.FreqDist(all_words)

plt.figure(figsize=(12,5))
plt.title('Top 25 most common words')
plt.xticks(fontsize=15)

freq_dist.plot(25, cumulative=False)

plt.show()

In [None]:
domain_stopwords = ['phone', 'mobile', 'twitter', 'rt', 'com', 'follow']
final_stopwords = domain_stopwords + list(combined_stopwords)
data.head()

#BOW MODEL

In [None]:
data.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

CV = CountVectorizer()
CV_features = CV.fit_transform(data['cleaned_tweets_w/o_SW'])
CV_features.shape
CV_features[0]

In [None]:
type(CV_features[0])

In [None]:
CV_features[0].todense()

In [None]:
import pandas as pd
df = pd.DataFrame(CV_features.todense() )
df

In [None]:
df.size/1e6
import numpy as np
np.count_nonzero(df)

In [None]:
100*np.count_nonzero(df)/df.size

In [None]:
CV_features[10].todense()

In [None]:
CV.inverse_transform(np.asarray(CV_features[10].todense()))

#MODEL BUILDING

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(CV_features, data['label'], test_size=0.25, stratify=data['label'], random_state=42)


In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression fitted on the training split.
LR = LogisticRegression(solver='liblinear')
LR.fit(X_train, y_train)

print(LR.score(X_train, y_train))  # mean accuracy on the training split
print(LR.score(X_test, y_test))   # mean accuracy on the held-out test split

#CROSS VALIDATING THE EXISTING MODEL

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold

In [None]:
X = CV_features
y = data['label']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np

LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = cross_val_score(LR1, X, y, cv=kfold, scoring='accuracy')
print(results)
print(np.round((results.mean()) * 100, 2), np.round((results.std()) * 100, 2))


In [None]:
#THE ACCURACY IS 88.23 +/- 0.93 %

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold

LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = cross_validate(LR1, X, y, cv=kfold, scoring='accuracy', return_train_score=True)


In [None]:
results

In [None]:
print(results['train_score'])
print(np.round((results['train_score'].mean()) * 100, 2), np.round((results['train_score'].std()) * 100, 2))


In [None]:
print(results['test_score'])
print(np.round((results['test_score'].mean()) * 100, 2), np.round((results['test_score'].std()) * 100, 2))


#HYPERPARAMETER TUNING FOR OPTIMAL RESULTS


In [None]:
from sklearn.model_selection import GridSearchCV

# L1-penalised, class-balanced logistic regression; only C is searched.
LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1')

# 20 candidate values of C, starting at 0.00001 in steps of 0.05
C_values = np.arange(0.00001, 1, 0.05)

grid = GridSearchCV(
    estimator=LR1,
    param_grid={'C': C_values},
    cv=kfold,
    scoring='accuracy',
    return_train_score=True,
    verbose=2,
    n_jobs=-1,
)
grid_results = grid.fit(X, y)

In [None]:
grid_results

In [None]:
grid_results.best_params_, grid_results.best_score_, grid_results.best_index_

In [None]:
grid_results.cv_results_.keys()

In [None]:
grid_results.cv_results_['mean_test_score'][grid_results.best_index_]*100

In [None]:
grid_results.cv_results_['mean_train_score'][grid_results.best_index_]*100

In [None]:
grid_results.cv_results_['std_test_score'][grid_results.best_index_]*100

In [None]:
grid_results.cv_results_['mean_test_score']

In [None]:
grid_results.cv_results_['mean_train_score']

In [None]:
plt.plot(grid_results.cv_results_['mean_train_score'] - grid_results.cv_results_['mean_test_score'])

In [None]:
grid_results.param_grid

In [None]:
grid_results.param_grid['C'][3]

In [None]:
grid_results.cv_results_['mean_train_score'] - grid_results.cv_results_['mean_test_score']

# Creating a pipeline & cross-validate

In [None]:

from sklearn.pipeline import make_pipeline, Pipeline

X = data['cleaned_tweets_w/o_SW']
y = data['label']

CV = CountVectorizer()
LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4)

CV_pipe = Pipeline([('CV', CV) , ('LR', LR1)] )

results = cross_val_score(CV_pipe, X, y, cv=kfold, scoring='accuracy')
print(np.round((results.mean())*100, 2), np.round((results.std())*100, 2))

In [None]:
CV_pipe.named_steps

In [None]:
CV_pipe.fit(X,y)

In [None]:
len(CV_pipe['CV'].vocabulary_)

#USE OF N-GRAM MODEL

In [None]:
#unigrams + bigrams
X = data['cleaned_tweets_w/o_SW']
y = data['label']

# min_df=5 keeps only those terms (unigrams or bigrams) that occur in AT LEAST
# 5 documents. Note: no cap on the vocabulary size is applied here; limiting
# the model to the top 1000 features would additionally need max_features=1000.
CV = CountVectorizer(stop_words=final_stopwords,  ngram_range=(1, 2), min_df=5)

LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4)
CV_pipe = Pipeline([('CV', CV) , ('LR', LR1)] )
results = cross_validate(CV_pipe, X, y, cv=kfold, scoring='accuracy', return_train_score=True)

# mean and std of the per-fold train accuracy, in percent
print(np.round((results['train_score'].mean())*100, 2), np.round((results['train_score'].std())*100, 2)) 

# mean and std of the per-fold test accuracy, in percent
print(np.round((results['test_score'].mean())*100, 2), np.round((results['test_score'].std())*100, 2)) 

# cross_validate fits clones of the pipeline, so CV itself is still unfitted
# at this point; fit it on the full corpus to inspect the vocabulary size.
CV.fit_transform(X)
len(CV.vocabulary_)

In [None]:
CV_pipe.named_steps

In [None]:
import joblib
joblib.__version__

In [None]:
import mglearn

In [None]:
# Fit the vectorizer first and read the feature names from that same fit, so
# this cell no longer relies on CV having been fitted by an earlier cell and
# the names are guaranteed to line up with the coefficient columns.
X_counts = CV.fit_transform(X)
feature_names = np.array(CV.get_feature_names_out())
LR1.fit(X_counts, y)
coef = LR1.coef_
mglearn.tools.visualize_coefficients(coef, feature_names, n_top_features=25)

In [None]:
len(coef.ravel()), len(feature_names)

In [None]:
mask = np.array([len(feature.split(" ")) for feature in feature_names]) == 2

LR1.fit(CV.fit_transform(X), y)
coef = LR1.coef_

# visualize only 2-gram features
mglearn.tools.visualize_coefficients(coef.ravel()[mask], feature_names[mask], n_top_features=25)

In [None]:
CV.vocabulary_ 

In [None]:

data.to_pickle("tweets_cleaned.pkl")

#WORD EMBEDDING


In [198]:
tweets_list = list(data['cleaned_tweets_w/o_SW'].apply(lambda x: x.split()))
tweets_list[0]

['fingerprint',
 'pregnancy',
 'test',
 'android',
 'apps',
 'beautiful',
 'cute',
 'health',
 'igers',
 'iphoneonly',
 'iphonesia',
 'iphone']

In [199]:
from gensim.models import Word2Vec
# train model
# sg=0 selects the CBOW training algorithm (sg=1 would be skip-gram).
# vector_size=300 is the embedding dimensionality, window=3 the maximum
# distance between the current and the predicted word, and min_count=5 drops
# words whose total frequency is below 5 from the vocabulary.
cbow_model = Word2Vec(tweets_list, vector_size = 300, window = 3, min_count=5, sg=0)

In [200]:
print(cbow_model)

Word2Vec<vocab=2420, vector_size=300, alpha=0.025>


In [201]:
cbow_model.wv.index_to_key[:20]

['iphone',
 'apple',
 'i',
 'my',
 'the',
 'to',
 'a',
 'is',
 'samsung',
 'it',
 'and',
 'you',
 'new',
 'twitter',
 'for',
 'com',
 'phone',
 'me',
 'sony',
 'not']

In [202]:
len(cbow_model.wv.index_to_key)

2420

In [None]:
def document_vector(doc):
    """Create a document vector by averaging the word vectors of its tokens.

    Tokens are obtained with ``doc.split()``; tokens that are not in the
    ``cbow_model`` vocabulary are ignored.

    Parameters
    ----------
    doc : str
        Whitespace-separated document text.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``cbow_model.wv.vector_size``: the mean of the
        in-vocabulary word vectors, or all zeros when the document contains
        no in-vocabulary token.
    """
    wv = cbow_model.wv

    # key_to_index is a dict, so the membership test is O(1); scanning the
    # index_to_key list cost O(vocabulary size) for every single token.
    vectors = [wv.get_vector(word) for word in doc.split() if word in wv.key_to_index]

    if not vectors:
        # np.array([]).mean(axis=0) would emit a RuntimeWarning and return a
        # scalar NaN, which breaks stacking document vectors into a matrix.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return np.array(vectors).mean(axis=0)

