<a href="https://colab.research.google.com/github/Vaishnavi-cyber-blip/Sentiment_api_-app/blob/main/Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
from zipfile import ZipFile
with ZipFile('Twitter_Data.csv.zip', 'r') as zipObj:
  zipObj.extractall()

In [3]:
df = pd.read_csv("/content/Twitter_Data.csv")
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [4]:
df.shape

(162980, 2)

In [5]:
df['category'].value_counts()

 1.0    72250
 0.0    55213
-1.0    35510
Name: category, dtype: int64

In [6]:
df['clean_text'][10]

'things like demonetisation gst goods and services tax…the upper castes would sort either view favourably say that need give this more time other castes like dalits the muslims were more against because that’ just not modi’ constituency2'

In [7]:
df.drop(df[df['clean_text'].isna()].index, inplace=True)
df.drop(df[df['category'].isna()].index, inplace=True)

In [8]:
df['clean_text'] = df['clean_text'].str.lower()
df['clean_text'].tail()

162975    why these 456 crores paid neerav modi not reco...
162976    dear rss terrorist payal gawar what about modi...
162977    did you cover her interaction forum where she ...
162978    there big project came into india modi dream p...
162979    have you ever listen about like gurukul where ...
Name: clean_text, dtype: object

Data preprocessing

In [9]:
import re
import nltk
import string
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [10]:
stopwords_set = set(stopwords.words('english'))
emoji_pattern = re.compile('(?::|;|=)(?:-)?(?:\)|\(|D|P)')

english_punctuations = string.punctuation
punctuations_list = english_punctuations


In [11]:

def cleaning_stopwords(text):
  return " ".join([word for word in str(text).split() if word not in stopwords_set])

def cleaning_punctuations(text):
  translator = str.maketrans('','', string.punctuation)
  return text.translate(translator)

def cleaning_repeating_char(text):
  return re.sub(r'(.)\1+', r'\1', text)

def cleaning_URLs(data):
  return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)

def stemming_on_text(data):
  stemmer = PorterStemmer()
  return [stemmer.stem(word) for word in data.split()]

def lemmatizer_on_text(data):
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(word) for word in data.split()]


def preprocessing(text):
  cleaned_text = cleaning_stopwords(text)
  cleaned_text = cleaning_punctuations(cleaned_text)
  cleaned_text = cleaning_repeating_char(cleaned_text)
  cleaned_text = cleaning_URLs(cleaned_text)
  cleaned_text = " ".join(stemming_on_text(cleaned_text))
  cleaned_text = " ".join(lemmatizer_on_text(cleaned_text))

  return cleaned_text

In [12]:
df['clean_text'] = df['clean_text'].apply(lambda x: preprocessing(x))

In [13]:
df['clean_text']

0         modi promis “minimum govern maximum governance...
1                      talk nonsens continu drama vote modi
2         say vote modi welcom bjp told rahul main campa...
3         ask suport prefix chowkidar name modi great se...
4         answer among power world leader today trump pu...
                                ...                        
162975    456 crore paid nerav modi recov congr leader h...
162976    dear r terorist payal gawar modi kile 10 plu m...
162977                            cover interact forum left
162978    big project came india modi dream project hape...
162979    ever listen like gurukul disciplin maintain ev...
Name: clean_text, Length: 162969, dtype: object

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(ngram_range=(1,2), max_features=5000)
y=df.category.values
x=tfidf.fit_transform(df.clean_text)

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.05, random_state =26105111)

In [16]:
from sklearn.linear_model import LogisticRegressionCV
clf=LogisticRegressionCV(cv=6,scoring='accuracy',random_state=0,n_jobs=-1,verbose=3,max_iter=500).fit(X_train,y_train)
y_pred = clf.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 10.1min finished


In [17]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8439072278807216


In [18]:
import pickle
pickle.dump(clf,open('clf.pkl','wb'))


In [19]:
pickle.dump(tfidf,open('tfidf.pkl','wb'))

In [20]:
def prediction(comment):
    preprocessed_comment = preprocessing(comment)
    comment_list = [preprocessed_comment]
    comment_vector = tfidf.transform(comment_list)
    prediction = clf.predict(comment_vector)[0]
    return prediction

prediction = prediction('service good')


In [21]:
if prediction == 1:
    print("positive comment")
elif prediction == -1:
    print("negative comment")
else:
  print("Neutral comment")

positive comment
