In [1]:
import pandas as pd 
import numpy as np

In [2]:
df = pd.read_csv("E:/Userfiles/Download/archive/Tamil_sentiments.csv",on_bad_lines='skip', sep = "\t", names=["Label","Comments"])
df

Unnamed: 0,Label,Comments
0,Negative,Enna da ellam avan seyal Mari iruku
1,Negative,This movei is just like ellam avan seyal
2,Positive,Padam vanthathum 13k dislike pottavaga yellam...
3,Positive,Neraya neraya neraya... ... V era level...thala
4,Positive,wow thavala sema mass....padam oru pundaikum ...
...,...,...
15739,Mixed_feelings,ivaru cinemala laam nalla tha prasuraaru...aa...
15740,Positive,Pattaya Kilaputhupaa trailer... !!!!! Get Raj...
15741,Mixed_feelings,En innum trending la varala? Ennada panringa ...
15742,not-Tamil,Rajnikant sir plz aap india ke pm ban jaao


In [3]:
df.columns

Index(['Label', 'Comments'], dtype='object')

In [4]:
df1= df.drop(['Label'], axis = 1)

In [5]:
df1 = df1.iloc[:5000, :]

In [6]:
df1

Unnamed: 0,Comments
0,Enna da ellam avan seyal Mari iruku
1,This movei is just like ellam avan seyal
2,Padam vanthathum 13k dislike pottavaga yellam...
3,Neraya neraya neraya... ... V era level...thala
4,wow thavala sema mass....padam oru pundaikum ...
...,...
4995,Màráñã Mass ..... Thalaivaaa . Sweet edung...
4996,Sirapana tharamana sambavam Iruku Ponugaluku....
4997,"Athana haters um odi poidunga , illa adichi u..."
4998,Sweet sapta porom....last dance semma


In [7]:
df1.isnull().sum()

Comments    0
dtype: int64

In [8]:
import string
import re   
import nltk
from nltk.corpus import stopwords

In [9]:
stemmer_func = nltk.stem.snowball.SnowballStemmer("english").stem

In [10]:
from nltk.stem.wordnet import WordNetLemmatizer 

In [11]:
lemmatizer = WordNetLemmatizer()

In [12]:
nltk.download('stopwords')
stop = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Priya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def pre_process(text):
    text = str(text).lower()
    text = re.sub(r"[-()\"#/@;:{}`+=~|.!?,'0-9]", "", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stop]
    text=" ".join(text)
    text = [stemmer_func(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [14]:
df1['clean_text'] = df1['Comments'].apply(lambda x: pre_process(x))

In [15]:
data = pd.DataFrame(df1)

In [16]:
data

Unnamed: 0,Comments,clean_text
0,Enna da ellam avan seyal Mari iruku,enna da ellam avan seyal mari iruku
1,This movei is just like ellam avan seyal,movei like ellam avan seyal
2,Padam vanthathum 13k dislike pottavaga yellam...,padam vanthathum k dislik pottavaga yellam ye...
3,Neraya neraya neraya... ... V era level...thala,neraya neraya neraya v era levelthala
4,wow thavala sema mass....padam oru pundaikum ...,wow thavala sema masspadam oru pundaikum aagathu
...,...,...
4995,Màráñã Mass ..... Thalaivaaa . Sweet edung...,màráñã mass thalaivaaa sweet edunga kond...
4996,Sirapana tharamana sambavam Iruku Ponugaluku....,sirapana tharamana sambavam iruku ponugaluku ...
4997,"Athana haters um odi poidunga , illa adichi u...",athana hater um odi poidunga illa adichi und...
4998,Sweet sapta porom....last dance semma,sweet sapta poromlast danc semma


In [17]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Priya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
from nltk.tokenize import word_tokenize

In [19]:
data['token_text'] = data['clean_text'].apply(word_tokenize)

In [20]:
def lemma(text):
    lemma_token = " "
    for word in text:
        lemma = word
        lemma_token = lemma_token + " " + lemma
    return lemma_token

In [21]:
data['Lemma_text'] = data['token_text'].apply(lemma)
data.head()

Unnamed: 0,Comments,clean_text,token_text,Lemma_text
0,Enna da ellam avan seyal Mari iruku,enna da ellam avan seyal mari iruku,"[enna, da, ellam, avan, seyal, mari, iruku]",enna da ellam avan seyal mari iruku
1,This movei is just like ellam avan seyal,movei like ellam avan seyal,"[movei, like, ellam, avan, seyal]",movei like ellam avan seyal
2,Padam vanthathum 13k dislike pottavaga yellam...,padam vanthathum k dislik pottavaga yellam ye...,"[padam, vanthathum, k, dislik, pottavaga, yell...",padam vanthathum k dislik pottavaga yellam y...
3,Neraya neraya neraya... ... V era level...thala,neraya neraya neraya v era levelthala,"[neraya, neraya, neraya, v, era, levelthala]",neraya neraya neraya v era levelthala
4,wow thavala sema mass....padam oru pundaikum ...,wow thavala sema masspadam oru pundaikum aagathu,"[wow, thavala, sema, masspadam, oru, pundaikum...",wow thavala sema masspadam oru pundaikum aag...


## Sentimental Analysis

In [22]:
from textblob import TextBlob

In [23]:
data['subjectivity'] = data["Lemma_text"].apply(lambda x: TextBlob(x).sentiment.subjectivity )
data

Unnamed: 0,Comments,clean_text,token_text,Lemma_text,subjectivity
0,Enna da ellam avan seyal Mari iruku,enna da ellam avan seyal mari iruku,"[enna, da, ellam, avan, seyal, mari, iruku]",enna da ellam avan seyal mari iruku,0.00
1,This movei is just like ellam avan seyal,movei like ellam avan seyal,"[movei, like, ellam, avan, seyal]",movei like ellam avan seyal,0.00
2,Padam vanthathum 13k dislike pottavaga yellam...,padam vanthathum k dislik pottavaga yellam ye...,"[padam, vanthathum, k, dislik, pottavaga, yell...",padam vanthathum k dislik pottavaga yellam y...,0.00
3,Neraya neraya neraya... ... V era level...thala,neraya neraya neraya v era levelthala,"[neraya, neraya, neraya, v, era, levelthala]",neraya neraya neraya v era levelthala,0.00
4,wow thavala sema mass....padam oru pundaikum ...,wow thavala sema masspadam oru pundaikum aagathu,"[wow, thavala, sema, masspadam, oru, pundaikum...",wow thavala sema masspadam oru pundaikum aag...,1.00
...,...,...,...,...,...
4995,Màráñã Mass ..... Thalaivaaa . Sweet edung...,màráñã mass thalaivaaa sweet edunga kond...,"[màráñã, mass, thalaivaaa, sweet, edunga, kond...",màráñã mass thalaivaaa sweet edunga kondadung,0.65
4996,Sirapana tharamana sambavam Iruku Ponugaluku....,sirapana tharamana sambavam iruku ponugaluku ...,"[sirapana, tharamana, sambavam, iruku, ponugal...",sirapana tharamana sambavam iruku ponugaluku...,0.00
4997,"Athana haters um odi poidunga , illa adichi u...",athana hater um odi poidunga illa adichi und...,"[athana, hater, um, odi, poidunga, illa, adich...",athana hater um odi poidunga illa adichi und...,0.00
4998,Sweet sapta porom....last dance semma,sweet sapta poromlast danc semma,"[sweet, sapta, poromlast, danc, semma]",sweet sapta poromlast danc semma,0.65


In [24]:
def sentiment_analysis(subjectivity):
    if subjectivity > 0:
        return 'Positive'
    elif subjectivity == 0:
        return 'Neutral'
    else:
        return 'Negative'

In [25]:
data['sub'] = data['subjectivity'].apply(sentiment_analysis)

In [26]:
Subjectivity = data[['Comments','sub']]
Subjectivity

Unnamed: 0,Comments,sub
0,Enna da ellam avan seyal Mari iruku,Neutral
1,This movei is just like ellam avan seyal,Neutral
2,Padam vanthathum 13k dislike pottavaga yellam...,Neutral
3,Neraya neraya neraya... ... V era level...thala,Neutral
4,wow thavala sema mass....padam oru pundaikum ...,Positive
...,...,...
4995,Màráñã Mass ..... Thalaivaaa . Sweet edung...,Positive
4996,Sirapana tharamana sambavam Iruku Ponugaluku....,Neutral
4997,"Athana haters um odi poidunga , illa adichi u...",Neutral
4998,Sweet sapta porom....last dance semma,Positive


In [27]:
from sklearn.preprocessing import LabelEncoder
LE=LabelEncoder()
Subjectivity['sub']=LE.fit_transform(Subjectivity['sub'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Subjectivity['sub']=LE.fit_transform(Subjectivity['sub'])


In [28]:
Subjectivity

Unnamed: 0,Comments,sub
0,Enna da ellam avan seyal Mari iruku,0
1,This movei is just like ellam avan seyal,0
2,Padam vanthathum 13k dislike pottavaga yellam...,0
3,Neraya neraya neraya... ... V era level...thala,0
4,wow thavala sema mass....padam oru pundaikum ...,1
...,...,...
4995,Màráñã Mass ..... Thalaivaaa . Sweet edung...,1
4996,Sirapana tharamana sambavam Iruku Ponugaluku....,0
4997,"Athana haters um odi poidunga , illa adichi u...",0
4998,Sweet sapta porom....last dance semma,1


In [29]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(Subjectivity['Comments'])
print(X.shape)
print(cv.get_feature_names())

(5000, 2500)
['00', '01', '02', '03', '04', '05', '06', '07', '08', '10', '100', '1000', '100k', '102k', '10k', '10m', '10million', '11', '11m', '12', '13', '135k', '13million', '14', '15', '150k', '15m', '16', '17', '18', '19', '1k', '1m', '1million', '1st', '20', '2000', '200k', '2018', '2019', '2020', '20m', '21', '23', '24', '25', '250', '25k', '26k', '28', '28k', '29', '29k', '2k', '2k19', '2m', '2point0', '30', '300', '31', '314k', '33', '33k', '34', '35', '36', '37', '38', '39', '39k', '3d', '3k', '3m', '40', '400', '42', '43k', '44k', '45', '47', '48', '4d', '50', '500', '500k', '50k', '52', '53', '54', '55', '56', '580k', '5m', '60', '63', '64', '67', '68', '69', '6m', '70', '700k', '74k', '775k', '776', '777k', '7k', '80', '800', '800k', '80k', '80s', '82k', '8m', '90', '90s', '91k', '92k', '96', 'aa', 'aaa', 'aaaa', 'aaaaaa', 'aaana', 'aachi', 'aachu', 'aadukalam', 'aadum', 'aaga', 'aagala', 'aaganum', 'aagathu', 'aagudhu', 'aagum', 'aaguma', 'aagumnu', 'aaguthu', 'aah', 'aa



In [30]:
y = Subjectivity['sub']

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [32]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4000, 2500), (1000, 2500), (4000,), (1000,))

## Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
Classifier = RandomForestClassifier()
Classifier.fit(X_train, y_train)

RandomForestClassifier()

In [34]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
y_pred = Classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[754   4]
 [ 37 205]]
0.959
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       758
           1       0.98      0.85      0.91       242

    accuracy                           0.96      1000
   macro avg       0.97      0.92      0.94      1000
weighted avg       0.96      0.96      0.96      1000



In [35]:
import pickle

In [36]:
pkl_out=open("vectorizer.pkl", "wb")
pickle.dump(cv, pkl_out)
pkl_out.close()

In [37]:

pickle_out = open("emotions1.pkl","wb")
loaded_model = pickle.dump(Classifier, pickle_out)
pickle_out.close()