In [32]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer, punkt, TweetTokenizer
from nltk.corpus import stopwords,state_union
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import sys
import string
import json
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import preprocessor as p

getdata=pd.read_csv('./Twitter_Data.csv',encoding='ISO-8859-1')
train_data= getdata.dropna()
train_data=pd.DataFrame({"SentimentText": tweets, "Sentiment": classes})
#Emoticons
tweets_text = train_data.SentimentText.str.cat()
emos = set(re.findall(r" ([xX:;][-']?.) ",tweets_text))
emos_count = []
for emo in emos:
    emos_count.append((tweets_text.count(emo), emo))
#print(sorted(emos_count,reverse=True))
HAPPY_EMO = r" ([xX;:]-?[dD)]|:-?[\)]|[;:][pP]) "
SAD_EMO = r" (:'?[/|\(]) "
print("Happy emoticons:", set(re.findall(HAPPY_EMO, tweets_text)))
print("Sad emoticons:", set(re.findall(SAD_EMO, tweets_text)))

def most_used_words(text):
    tokens = word_tokenize(text)
    frequency_dist = nltk.FreqDist(tokens)
    print("There is %d different words" % len(set(tokens)))
    return sorted(frequency_dist, key=frequency_dist.__getitem__, reverse=True)

def stop_words():
    mw = most_used_words(train_data.SentimentText.str.cat())
    most_words = []
    for w in mw:
        if len(most_words) == 1000:
            break
        if w in stopwords.words("english"):
            continue
        else:
            most_words.append(w)
    print(sorted(most_words))

def stem_tokenize(text):
    stemmer = SnowballStemmer("english")
    stemmer = WordNetLemmatizer()
    return [stemmer.lemmatize(token) for token in word_tokenize(text)]

def lemmatize_tokenize(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]


class TextPreProc(BaseEstimator, TransformerMixin):
    def __init__(self, use_mention=False):
        self.use_mention = use_mention

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # We can choose between keeping the mentions
        # or deleting them
        if self.use_mention:
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", " @tags ")
        else:
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", "")

        # Keeping only the word after the #
        X = X.str.replace("#", "")
        X = X.str.replace(r"[-\.\n]", "")
        # Removing HTML garbage
        X = X.str.replace(r"&\w+;", "")
        # Removing links
        X = X.str.replace(r"https?://\S*", "")
        # replace repeated letters with only two occurences
        # heeeelllloooo => heelloo
        X = X.str.replace(r"(.)\1+", r"\1\1")
        # mark emoticons as happy or sad
        X = X.str.replace(HAPPY_EMO, " happyemoticons ")
        X = X.str.replace(SAD_EMO, " sademoticons ")
        X = X.str.lower()
        return X

sentiments = train_data['Sentiment']
tweets = train_data['SentimentText']

# I get those parameters from the 'Fine tune the model' part
vectorizer = TfidfVectorizer(tokenizer=lemmatize_tokenize, ngram_range=(1,2))
pipeline = Pipeline([
    ('text_pre_processing', TextPreProc(use_mention=True)),
    ('vectorizer', vectorizer),
])
#  split data into learning set and testing set
learn_data, test_data, sentiments_learning, sentiments_test = train_test_split(tweets, sentiments, test_size=0.3)
learning_data = pipeline.fit_transform(train_data['SentimentText'].apply(lambda learning_data: np.str_(learning_data)))


#lr = LogisticRegression()
#bnb = BernoulliNB()
mnb = MultinomialNB()

models = {
    #'logitic regression': lr,
    #'bernoulliNB': bnb,
    'multinomialNB': mnb,
}
for model in models.keys():
    scores = cross_val_score(models[model], learning_data, sentiments_learning, scoring="f1", cv=10)
    print("===", model, "===")
    print("scores = ", scores)
    print("mean = ", scores.mean())
    print("variance = ", scores.var())
    models[model].fit(learning_data, sentiments_learning)
    print("score on the learning data (accuracy) = ", accuracy_score(models[model].predict(learning_data), sentiments_learning))
    print("")


grid_search_pipeline = Pipeline([
    ('text_pre_processing', TextPreProc()),
    ('vectorizer', TfidfVectorizer()),
    ('model', MultinomialNB()),
])

params = [
    {
        'text_pre_processing__use_mention': [True, False],
        'vectorizer__max_features': [1000, 2000, 5000, 10000, 20000, None],
        'vectorizer__ngram_range': [(1,1), (1,2)],
    },
]
grid_search = GridSearchCV(grid_search_pipeline, params, cv=5, scoring='f1')
grid_search.fit(learn_data, sentiments_learning)
print(grid_search.best_params_)


Happy emoticons: set()
Sad emoticons: set()


ValueError: Found input variables with inconsistent numbers of samples: [162969, 114078]

In [17]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
nRowsRead = 100000 # specify 'None' if want to read whole file
df2 = pd.read_csv('./Twitter_Data.csv', delimiter=',', nrows = nRowsRead)
df2.dataframeName = 'Twitter_Data.csv'
nRow, nCol = df2.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 100000 rows and 2 columns


In [18]:
df2.head(5)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1
1,talk all the nonsense and continue all the dra...,0
2,what did just say vote for modi welcome bjp t...,1
3,asking his supporters prefix chowkidar their n...,1
4,answer who among these the most powerful world...,1


In [20]:
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, train_test_split

In [23]:
data1 = pd.read_csv('Twitter_Data.csv')
data1 = data1.dropna()
data1

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [25]:
tweets = list(data1['clean_text']) 
classes = []

for r in data1["category"]:
    if  r == -1:
        classes.append("Negative")
    if  r == 0:
        classes.append("Neutral")
    if  r == 1:
        classes.append("Positive")
    if r != 0 and r!= 1 and r != -1:
          print(r)
    
base = pd.DataFrame({"tweet": tweets, "category": classes})
base

Unnamed: 0,tweet,category
0,when modi promised “minimum government maximum...,Negative
1,talk all the nonsense and continue all the dra...,Neutral
2,what did just say vote for modi welcome bjp t...,Positive
3,asking his supporters prefix chowkidar their n...,Positive
4,answer who among these the most powerful world...,Positive
...,...,...
162964,why these 456 crores paid neerav modi not reco...,Negative
162965,dear rss terrorist payal gawar what about modi...,Negative
162966,did you cover her interaction forum where she ...,Neutral
162967,there big project came into india modi dream p...,Neutral


In [28]:
tweets = base["tweet"]
classes = base['category']
tweets,classes

(0         when modi promised “minimum government maximum...
 1         talk all the nonsense and continue all the dra...
 2         what did just say vote for modi  welcome bjp t...
 3         asking his supporters prefix chowkidar their n...
 4         answer who among these the most powerful world...
                                 ...                        
 162964    why these 456 crores paid neerav modi not reco...
 162965    dear rss terrorist payal gawar what about modi...
 162966    did you cover her interaction forum where she ...
 162967    there big project came into india modi dream p...
 162968    have you ever listen about like gurukul where ...
 Name: tweet, Length: 162969, dtype: object,
 0         Negative
 1          Neutral
 2         Positive
 3         Positive
 4         Positive
             ...   
 162964    Negative
 162965    Negative
 162966     Neutral
 162967     Neutral
 162968    Positive
 Name: category, Length: 162969, dtype: object)

In [None]:
def Preprocessing(data):
    stemmer = nltk.stem.RSLPStemmer()
    data = re.sub(r"http\S+", "", data).lower().replace('.','').replace(';','').replace('-','').replace(':','').replace(')','')
    stopwords = set(nltk.corpus.stopwords.words('english'))
    words = [stemmer.stem(i) for i in data.split() if not i in stopwords]
    return (" ".join(words))

tweets = [Preprocessing(i) for i in tweets]


In [None]:
vectorizer = CountVectorizer(analyzer="word")
freq_tweets_train = vectorizer.fit_transform(tweets)
model = MultinomialNB()
model.fit(freq_tweets_train,y_train)
for t, c in zip (X_test,model.predict(freq_tweets_test)):
    print (t +", "+ c)
