In [1]:
# importing all the important libraries
import numpy as np
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Load the positive reviews (one review per line) and label them with mood = 1.
#
# The file is read line by line instead of through pd.read_csv(..., sep='\n'):
# newer pandas releases reject '\n' as a separator with a ValueError, and the
# CSV parser would additionally interpret quote characters inside a review.

with open('netflix/pos.txt', encoding='latin-1') as pos_file:
    # Strip the line terminator and skip lines that contain no text at all.
    pos_lines = [line.rstrip('\r\n') for line in pos_file if line.strip()]

# Built in one step (no inplace rename) so re-running the cell is idempotent.
pos_rev = pd.DataFrame({'review': pos_lines, 'mood': 1})
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [3]:
# Load the negative reviews (one review per line) and label them with mood = 0.
#
# The file is read line by line instead of through pd.read_csv(..., sep='\n'):
# newer pandas releases reject '\n' as a separator with a ValueError, and the
# CSV parser would additionally interpret quote characters inside a review.

with open('netflix/negative.txt', encoding='latin-1') as neg_file:
    # Strip the line terminator and skip lines that contain no text at all.
    neg_lines = [line.rstrip('\r\n') for line in neg_file if line.strip()]

# Built in one step (no inplace rename) so re-running the cell is idempotent.
neg_rev = pd.DataFrame({'review': neg_lines, 'mood': 0})
neg_rev

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


In [4]:
# cleaning plan: lower -----> remove stopwords -----> remove punctuation tokens -----> join to string   (lemmatising: still to do)

In [5]:
# Initialise the English stop-word list used by the cleaning cells.
# stopwords.words() raises LookupError when the NLTK "stopwords" corpus has not
# been downloaded yet (e.g. on a fresh machine), so fetch it once and retry.

try:
    sw = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    sw = stopwords.words('english')

In [6]:
# Normalise the positive reviews: lowercase, drop English stop words, then drop
# tokens that are bare punctuation, and join what is left back into a string.

def _tidy_positive(text):
    """Return `text` lowercased with stop words and punctuation tokens removed."""
    kept = []
    for token in text.lower().split():
        if token in sw:
            continue
        # Whole-token substring test against string.punctuation: a token such
        # as "," is dropped, while "tedious." is kept unchanged.
        if token in string.punctuation:
            continue
        kept.append(token)
    return " ".join(kept)

pos_rev['review'] = pos_rev['review'].apply(_tidy_positive)

In [7]:
# Normalise the negative reviews in stages: lowercase + tokenise, drop English
# stop words, drop bare-punctuation tokens, then join back into a string.

neg_tokens = neg_rev['review'].apply(lambda text: text.lower().split())
neg_tokens = neg_tokens.apply(lambda toks: [tok for tok in toks if tok not in sw])
# Whole-token substring test against string.punctuation: "," is dropped,
# "tedious." is kept unchanged.
neg_tokens = neg_tokens.apply(
    lambda toks: [tok for tok in toks if tok not in string.punctuation]
)
neg_rev['review'] = neg_tokens.apply(" ".join)

In [8]:
# Stack the positive and negative reviews into one frame with a fresh 0..n-1 index.

com_rev = pd.concat([pos_rev, neg_rev], ignore_index=True)
com_rev

Unnamed: 0,review,mood
0,rock destined 21st century's new conan he's go...,1
1,gorgeously elaborate continuation lord rings t...,1
2,effective too-tepid biopic,1
3,sometimes like go movies fun wasabi good place...,1
4,emerges something rare issue movie that's hone...,1
...,...,...
10657,terrible movie people nevertheless find moving,0
10658,many definitions 'time waster' movie must sure...,0
10659,stands crocodile hunter hurried badly cobbled ...,0
10660,thing looks like made-for-home-video quickie,0


In [9]:
# Hold out 20% of the reviews for testing; the fixed random_state makes the
# split repeatable across runs.

review_texts = com_rev['review'].values
mood_labels = com_rev['mood'].values

X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    mood_labels,
    test_size=0.2,
    random_state=101,
)

In [10]:
# Wrap the split arrays back into DataFrames using the same column names
# ('review', 'mood') as the combined frame.
train_data = pd.DataFrame(dict(review=X_train, mood=y_train))
test_data = pd.DataFrame(dict(review=X_test, mood=y_test))

In [11]:
# Preview the training split (cleaned review text plus its mood label).
train_data

Unnamed: 0,review,mood
0,puts washington honest working man john q arch...,0
1,poignant familiar story young person suspended...,1
2,timely director could ever dreamed quietly lyr...,1
3,film virtually chokes self-consciousness,0
4,film takes inside rhythms subject experience w...,1
...,...,...
8524,branagh forceful non-shakespeare screen perfor...,1
8525,movie friday fans critics damned already like ...,0
8526,perhaps heaviest joyless movie ever made giant...,0
8527,film rival live fine little amuse-bouche keep ...,1


In [12]:
# Display the training split again (repeats the preview in the preceding cell).
train_data

Unnamed: 0,review,mood
0,puts washington honest working man john q arch...,0
1,poignant familiar story young person suspended...,1
2,timely director could ever dreamed quietly lyr...,1
3,film virtually chokes self-consciousness,0
4,film takes inside rhythms subject experience w...,1
...,...,...
8524,branagh forceful non-shakespeare screen perfor...,1
8525,movie friday fans critics damned already like ...,0
8526,perhaps heaviest joyless movie ever made giant...,0
8527,film rival live fine little amuse-bouche keep ...,1


In [15]:
# Preview the held-out test split (cleaned review text plus its mood label).
test_data

Unnamed: 0,review,mood
0,important movie reminder power film move us ma...,1
1,i've never seen heard anything quite like film...,1
2,ending leave unfulfilled performances enjoy me...,1
3,surface lovers-on-the-run crime flick lot comm...,1
4,walk remember shrewd enough activate girlish t...,0
...,...,...
2128,bullock good job working natural likability,1
2129,results memorable least interesting,1
2130,apparently designed reverie memory regret thin...,0
2131,movie insecure capacity excite churns one two ...,0


In [None]:
load data -----> clean data -----> vectorise -----> ml algorithm -----> tweak -----> deploy

In [16]:
# Turn the cleaned reviews into TF-IDF features.
# The vectoriser is fitted on the training reviews only (fit_transform); the
# test reviews are only transformed with that fitted vectoriser.
vectoriser = TfidfVectorizer()
train_vectors = vectoriser.fit_transform(train_data['review'])
# Fixed: the column is named 'review'; the misspelt key 'reveiw' raised KeyError.
test_vectors = vectoriser.transform(test_data['review'])

In [None]:
# svm
# naive bayes

In [17]:
from sklearn import svm
from sklearn.metrics import classification_report

In [18]:
# Train a support vector classifier with the library's default settings on the
# TF-IDF features of the training reviews.
train_labels = train_data['mood']
classifier = svm.SVC()
classifier.fit(train_vectors, train_labels)

SVC()

In [20]:
# Predict a mood label for every review in the held-out test set.
pred = classifier.predict(test_vectors)

In [24]:
# Score the test predictions and print the recall of each class
# ('1' = positive reviews, '0' = negative reviews).
report = classification_report(test_data['mood'], pred, output_dict=True)
for caption, class_key in (("positve", '1'), ("neagtive", '0')):
    print(f"{caption} {report[class_key]['recall']}")

positve 0.7435185185185185
neagtive 0.7739791073124407


{'0': {'precision': 0.7463369963369964,
  'recall': 0.7739791073124407,
  'f1-score': 0.7599067599067598,
  'support': 1053},
 '1': {'precision': 0.7713736791546589,
  'recall': 0.7435185185185185,
  'f1-score': 0.7571900047147571,
  'support': 1080},
 'accuracy': 0.7585560243788092,
 'macro avg': {'precision': 0.7588553377458276,
  'recall': 0.7587488129154796,
  'f1-score': 0.7585483823107584,
  'support': 2133},
 'weighted avg': {'precision': 0.759013797763661,
  'recall': 0.7585560243788092,
  'f1-score': 0.7585311876576445,
  'support': 2133}}

In [26]:
# Persist the fitted TF-IDF vectoriser and the trained classifier to disk with
# joblib so they can be reloaded later without refitting.
import joblib
joblib.dump(vectoriser , 'tfidf_vector_model.pkl')
joblib.dump(classifier , 'netflix_75.pkl')

['netflix_75.pkl']

In [36]:

# Reload the persisted vectoriser and classifier from disk.
tfidf = joblib.load('tfidf_vector_model.pkl')
model = joblib.load('netflix_75.pkl')

# Score one hand-written review with the reloaded pipeline.
data = ['best movie']
vector = tfidf.transform(data)
my_pred = model.predict(vector)

# Label 1 is the positive class; any other label is reported as negative.
verdict = 'positve review' if my_pred[0] == 1 else 'negative review'
print(verdict)
    

positve review


In [32]:
# Show the raw prediction array returned by model.predict for the sample review.
my_pred

array([0])

<1x16390 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [42]:
# List the vocabulary learned by the TF-IDF vectoriser.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and only fall back on older installs.
if hasattr(tfidf, 'get_feature_names_out'):
    # .tolist() keeps the displayed result a plain Python list, as before.
    feature_names = tfidf.get_feature_names_out().tolist()
else:
    feature_names = tfidf.get_feature_names()
feature_names

['00',
 '000',
 '007',
 '10',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '10th',
 '11',
 '110',
 '112',
 '11th',
 '12',
 '127',
 '129',
 '12th',
 '13',
 '133',
 '13th',
 '14',
 '140',
 '146',
 '15',
 '15th',
 '16',
 '163',
 '168',
 '17',
 '170',
 '179',
 '18',
 '180',
 '1899',
 '18th',
 '19',
 '1915',
 '1930s',
 '1934',
 '1937',
 '1938',
 '1940s',
 '1949',
 '1950',
 '1950s',
 '1952',
 '1953',
 '1954',
 '1955',
 '1958',
 '1959',
 '1960',
 '1960s',
 '1962',
 '1967',
 '1970s',
 '1971',
 '1972',
 '1975',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1982',
 '1984',
 '1986',
 '1987',
 '1990',
 '1992',
 '1993',
 '1994',
 '1995',
 '1997',
 '1998',
 '1999',
 '19th',
 '20',
 '2000',
 '2002',
 '20th',
 '21',
 '21st',
 '22',
 '24',
 '2455',
 '25',
 '2525',
 '26',
 '270',
 '28k',
 '30',
 '300',
 '3000',
 '30s',
 '33',
 '37',
 '3d',
 '40',
 '400',
 '40s',
 '45',
 '451',
 '48',
 '4ever',
 '50',
 '500',
 '50s',
 '51',
 '51st',
 '52',
 '53',
 '5ths',
 '60',
 '60s',
 '65',
 '65th',
 '66',
 '70',
 '70s',

In [40]:
# Count the entries of a hand-pasted literal list of vocabulary tokens; every
# entry in the list starts with a digit.
len(['00',
 '000',
 '007',
 '10',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '10th',
 '11',
 '110',
 '112',
 '11th',
 '12',
 '127',
 '129',
 '12th',
 '13',
 '133',
 '13th',
 '14',
 '140',
 '146',
 '15',
 '15th',
 '16',
 '163',
 '168',
 '17',
 '170',
 '179',
 '18',
 '180',
 '1899',
 '18th',
 '19',
 '1915',
 '1930s',
 '1934',
 '1937',
 '1938',
 '1940s',
 '1949',
 '1950',
 '1950s',
 '1952',
 '1953',
 '1954',
 '1955',
 '1958',
 '1959',
 '1960',
 '1960s',
 '1962',
 '1967',
 '1970s',
 '1971',
 '1972',
 '1975',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1982',
 '1984',
 '1986',
 '1987',
 '1990',
 '1992',
 '1993',
 '1994',
 '1995',
 '1997',
 '1998',
 '1999',
 '19th',
 '20',
 '2000',
 '2002',
 '20th',
 '21',
 '21st',
 '22',
 '24',
 '2455',
 '25',
 '2525',
 '26',
 '270',
 '28k',
 '30',
 '300',
 '3000',
 '30s',
 '33',
 '37',
 '3d',
 '40',
 '400',
 '40s',
 '45',
 '451',
 '48',
 '4ever',
 '50',
 '500',
 '50s',
 '51',
 '51st',
 '52',
 '53',
 '5ths',
 '60',
 '60s',
 '65',
 '65th',
 '66',
 '70',
 '70s',
 '71',
 '72',
 '75',
 '76',
 '77',
 '79',
 '7th',
 '80',
 '800',
 '80s',
 '82',
 '8217',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '8th',
 '90',
 '90s',
 '91',
 '93',
 '94',
 '95',
 '97',
 '98',
 '99',])

150

In [None]:
task - remove the numbers and lemmatise the text
deploy on flask

reading - curse of dimensionality
- pca

own project - end to end -- till flask

spam classifier - dataset link - https://www.kaggle.com/venky73/spam-mails-dataset

In [None]:
tmr - 

practical on cosine similarity from scratch

information retrieval concept

coding part

5 major