In [1]:
import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Importing the necessary packages and modules

In [2]:
import re # for using regular expression (regex)
import pandas as pd # reading and processing the dataset
import matplotlib.pyplot as plt # for graphs and visualization
import seaborn as sns # plotting heatmaps
import pickle # to save the model weights for future use
import nltk # natural language toolkit library for text processing
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer # lemmatization package
from sklearn.feature_extraction.text import TfidfVectorizer # vectorizer

## Reading the dataset

In [9]:
# Column names applied to the CSV. Because `names` is passed explicitly,
# pandas treats the first line of the file as data rather than as a header.
#   'target' - sentiment of the tweet (0 means negative, 2 means neutral and 4 means positive)
#   'text'   - the tweet written by the user
columns = ['target', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv(
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    names=columns,
    encoding='ISO-8859-1',
)

## Dataset description and selecting the useful data

In [10]:
# Preview the first 100 rows of the loaded frame.
df.head(100)

In [11]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [12]:
# Per-column flag: True when the column contains at least one null value.
df.isnull().any()

In [13]:
# Keep only the sentiment label and the tweet text; `df` is rebound to the narrower frame.
selected_columns = ['target', 'text']
df = df[selected_columns]
df

In [14]:
# Number of tweets per distinct value of the 'target' column.
df['target'].value_counts()

In [15]:
# As it turns out, the dataset does not contain any neutral-sentiment tweets (target == 2), so we can treat this as a binary classification problem.

In [19]:
# Bar chart of the number of tweets in each sentiment class.
ax = sns.countplot(x='target', data=df)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Negative', 'Positive'])
ax.set_xlabel('Sentiment')
ax.set_ylabel('Value count of the sentiments')

In [20]:
# Relabel the positive class from 4 to 1, then list the distinct labels that remain.
df['target'] = df['target'].replace(to_replace=4, value=1)
df['target'].unique()

In [21]:
# To make use of the emoticons that appear in the tweets, define a dictionary that maps
# each emoticon to a word describing its meaning.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed',':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused',
          '$_$': 'greedy','@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused','<(-_-)>': 'robot', 'd[-_-]b': 'dj', 
          ":'-)": 'sadsmile',';)': 'wink',';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}
# Stopwords are words that do not contribute much to the sentiment/meaning of the text.
# This list is handed to TfidfVectorizer via `stop_words`, which drops them during vectorization.
stopword =  ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an','and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do','does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here','hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma','me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them','themselves', 'then', 'there', 'these', 'they', 'this', 'those', 
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was', 'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre","youve", 'your', 'yours', 'yourself', 'yourselves']

In [22]:
# Preprocessing function to clean the data.
def preprocess(text, wordLemm, emoji_map=None):
    """Clean raw tweets into normalised, lemmatized strings.

    Each tweet goes through these steps, in order:

    1. lower-case the whole tweet;
    2. replace URLs (``http://``, ``https://`` or `` www.`` prefixes) with `` URL``;
    3. replace every emoticon in ``emoji_map`` with ``EMOJI<meaning>``;
    4. replace ``@mentions`` with `` USER``;
    5. replace every remaining non-alphanumeric character with a space;
    6. squeeze runs of three or more identical characters down to two;
    7. drop one-character tokens and lemmatize the rest.

    Parameters
    ----------
    text : iterable of str
        The raw tweets.
    wordLemm : object
        Anything exposing ``lemmatize(word) -> str``
        (e.g. ``nltk.stem.WordNetLemmatizer``).
    emoji_map : dict of str -> str, optional
        Emoticon-to-meaning mapping. Defaults to the module-level ``emojis``
        dictionary.

    Returns
    -------
    list of str
        One cleaned string per input tweet. Every kept token is followed by a
        single space, so non-empty results end with a trailing space.
    """
    if emoji_map is None:
        emoji_map = emojis

    # Patterns are compiled once, outside the per-tweet loop. Raw strings are
    # used throughout so that `\s` and `\1` are never read as string escapes.
    url_re = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    user_re = re.compile(r"@[^\s]+")
    non_alnum_re = re.compile(r"[^a-zA-Z0-9]")
    repeat_re = re.compile(r"(.)\1\1+")

    # Tweets are lower-cased before emoticon matching, so the keys have to be
    # lower-cased as well; otherwise keys such as ':P' or ':-D' never match.
    # The mapping's own iteration order is preserved.
    emoji_subs = [(key.lower(), "EMOJI" + meaning) for key, meaning in emoji_map.items()]

    processedText = []
    for tweet in text:
        tweet = tweet.lower()

        # Replace all URLs with 'URL'.
        tweet = url_re.sub(' URL', tweet)
        # Replace all emoticons with 'EMOJI<meaning>'.
        for emoticon, replacement in emoji_subs:
            tweet = tweet.replace(emoticon, replacement)
        # Replace @USERNAME with 'USER'.
        tweet = user_re.sub(' USER', tweet)
        # Replace every non-alphanumeric character with a space.
        tweet = non_alnum_re.sub(" ", tweet)
        # Squeeze 3 or more consecutive identical characters down to 2.
        tweet = repeat_re.sub(r"\1\1", tweet)

        # Keep tokens longer than one character, lemmatized, each followed by a space.
        processedText.append(
            ''.join(wordLemm.lemmatize(word) + ' ' for word in tweet.split() if len(word) > 1)
        )

    return processedText

Instantiate the `WordNetLemmatizer` and `TfidfVectorizer`

In [24]:
# Clean every tweet with the WordNet lemmatizer; the sentiment labels form the target vector.
wordLemm = WordNetLemmatizer()
y = df['target']
X = preprocess(df['text'].tolist(), wordLemm)

# TF-IDF over unigrams and bigrams, capped at one million features, with the custom stop list.
vect = TfidfVectorizer(
    stop_words=stopword,
    ngram_range=(1, 2),
    max_features=1000000,
)

## Splitting the preprocessed data into train and test sets

In [25]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for testing; the fixed random_state keeps the split reproducible.
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)
# Summarise the training split. `.info` has to be *called* to produce the summary, and the
# labels go in as a column here rather than being passed as the frame's index.
pd.DataFrame({'text': X_train, 'target': y_train.to_numpy()}).info()

In [26]:
# Learn the TF-IDF vocabulary from the training tweets only and vectorize them in one step,
# then project the test tweets onto that same vocabulary.
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

## Using Logistic Regression and Multinomial Naive Bayes 

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Model performance parameters
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

### Multinomial Naive Bayes :

In [28]:
# Fit Multinomial Naive Bayes on the TF-IDF training features and score it on the held-out split.
nb_clf = MultinomialNB().fit(X_train, y_train)
nb_pred = nb_clf.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)
print('Accuracy of Multinomial Naive Bayes:', nb_accuracy)

### Logistic Regression :

In [33]:
# Fit an L2-regularised logistic regression (newton-cg solver, C=2) and score it on the held-out split.
log_clf = LogisticRegression(
    penalty='l2',
    C=2,
    solver='newton-cg',
    max_iter=1000,
    n_jobs=-1,
).fit(X_train, y_train)
log_pred = log_clf.predict(X_test)
log_accuracy = accuracy_score(y_test, log_pred)
print('Accuracy of Logistic Regression:', log_accuracy)

In [27]:
# Grid search over key hyperparameters (solver and C) for logistic regression.
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# The search is fitted on the TF-IDF training split (X_train, y_train). No synthetic dataset is
# generated here: the fit below never used one, and assigning it to `X, y` would overwrite the
# notebook's preprocessed tweets and labels.
# define models and parameters
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
# NOTE: 15 parameter combinations x (10 splits x 3 repeats) = 450 model fits.
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# error_score=0 records a score of 0 for a combination whose fit raises, instead of aborting the search.
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
    

In [34]:
def tweet_analysis(str):
    """Print the logistic-regression sentiment prediction for a single raw tweet.

    The tweet is passed through ``preprocess`` before vectorization, so it is
    normalised (URLs, emoticons, mentions, lemmatization) the same way the
    training text was. Without that step the vectorizer sees raw text the
    model was never trained on.

    Parameters
    ----------
    str : str
        The raw tweet text. The name shadows the built-in ``str``; it is kept
        so existing keyword calls continue to work.

    Returns
    -------
    None
        The predicted label array is printed, not returned.
    """
    cleaned = preprocess([str], wordLemm)
    print(log_clf.predict(vect.transform(cleaned)))

In [41]:
# Try the classifier on a hand-written example; the predicted label is printed by tweet_analysis.
tweet_analysis("sentiment ananlysis is good :)")

**Logistic regression** is more accurate than Multinomial Naive Bayes, the other model tried above.

In [42]:
# Confusion matrix of the logistic-regression predictions on the test split,
# printed as raw counts and then drawn as an annotated heatmap.
cm = confusion_matrix(y_test, log_pred)
print(cm)

fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, ax=ax)
plt.show()

In [43]:
# Text report of per-class metrics for the logistic-regression predictions on the test split.
print(classification_report(y_test,log_pred))