# Importing necessary libraries

In [None]:
import numpy as np
from matplotlib import pyplot
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_curve

# Reading dataset using pandas.

In [None]:
# Load the review dataset from a CSV file in the working directory.
# Later cells read the 'review' column as the raw text and the second
# column (position 1) as the label.
dataset = pd.read_csv('IMDB_Dataset.csv')

# Cleaning dataset for Feature Extraction.

In [None]:
# Build `corpus`: one cleaned, stemmed, space-joined string per review.
MAX_REVIEWS = 20000                                # Only the first 20k reviews are used because processing all of them is slow.

# Loop invariants are created once instead of once per review / per word.
ps = PorterStemmer()                               # Stemming reduces inflected words to their word stem.
stop_words = set(stopwords.words('english'))       # A set gives O(1) membership tests.
non_letters = re.compile('[^a-zA-Z]')              # Anything that is not an ASCII letter.

corpus = []
# .iloc slicing is positional and simply yields fewer rows if the file is
# shorter than MAX_REVIEWS, instead of raising on a missing index label.
for raw_review in dataset['review'].iloc[:MAX_REVIEWS]:
    words = non_letters.sub(' ', raw_review).lower().split()     # strip punctuation/digits, lower-case, tokenize
    kept = [ps.stem(word) for word in words if word not in stop_words]
    corpus.append(' '.join(kept))                                # re-join the surviving stems into one string

# Feature Extraction Using CountVectorizer

In [None]:
# Bag-of-words vectorizer whose vocabulary is capped at 5000 features.
# It is only constructed here; it is fitted on the cleaned corpus in the
# next cell, and the resulting counts are used to classify reviews as
# positive or negative.
cv = CountVectorizer(max_features=5000)

# Identify training and test data from the featureset.

In [None]:
# Feature matrix: token counts for every cleaned review, as a dense array.
X = cv.fit_transform(corpus).toarray()

# Labels come from the second column of the dataset. Slice to len(corpus)
# rather than a hard-coded count so X and y always have the same number of
# rows, whatever subset of reviews was cleaned.
y = dataset.iloc[:len(corpus), 1].values

# Convert the categorical labels to integer codes.
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

# Hold out 25% of the samples for testing (75:25 train/test split).
# NOTE(review): no random_state is set, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
 

# Create and train a Machine Learning model.

In [None]:
# Support-vector classifier for the reviews, constructed and trained in one
# step (fit returns the fitted estimator). probability=True is required
# because predict_proba is called on this model for the precision-recall curve.
classifier = SVC(probability=True).fit(X_train, y_train)

# Validate the model created.

In [None]:
# Predict labels for the held-out test set (data the model has not seen).
y_pred = classifier.predict(X_test)

# Spot-check the model on a few hand-written reviews.
#
# The vectorizer's vocabulary was learned from cleaned text (letters only,
# lower-cased, stopwords removed, Porter-stemmed). New text must go through
# the same cleaning, otherwise raw tokens such as "movie" never match the
# stemmed vocabulary entries and are silently dropped.
_review_stemmer = PorterStemmer()
_review_stop_words = set(stopwords.words('english'))


def _clean_review(text):
    """Return *text* cleaned the same way as the training corpus."""
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(
        _review_stemmer.stem(word)
        for word in words
        if word not in _review_stop_words
    )


reviews = [
    "I love this movie",
    "This movie is bad",
    "I was going to say something awesome or great or good, but I simply can't because the movie is so bad.",
    "It might have bad actors, but everything else is good.",
    "This movie turned out to be better than I had expected it to be. Some parts were pretty funny. It was nice to have a movie with a new plot.",
    "First one was much better, I had enjoyed it a lot. This one has not even produced a smile. The idea was showing how deep down can human kind fall, but in reference to the characters not the film-maker.",
]
for review in reviews:
    op = classifier.predict(cv.transform([_clean_review(review)]).toarray())
    # Encoded label 0 is reported as negative, anything else as positive.
    # NOTE(review): this assumes the label encoder mapped the negative class
    # to 0 -- confirm against the dataset's label values.
    if op[0] == 0:
        print(review, '=', 'negative')
    else:
        print(review, '=', 'positive')

# Evaluate the model

In [None]:
# Report test-set metrics for the SVM: accuracy (as a percentage), the
# confusion matrix, and the per-class precision/recall/F1 report.
accuracy_percent = accuracy_score(y_test, y_pred) * 100
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print('SVM-Evaluation:\n')
print('Accuracy score : ', accuracy_percent, '\n')
print(confusion, '\n')
print(report)


# Precision Recall curve

In [None]:
# Probability of the positive outcome (column 1 of predict_proba) for each
# test sample.
lr_probs = classifier.predict_proba(X_test)[:, 1]

# Predicted class values.
# NOTE(review): yhat is not used below and repeats the y_pred computation.
yhat = classifier.predict(X_test)

# Precision/recall pairs over all probability thresholds (thresholds unused).
lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)

# Baseline: fraction of test samples whose label is 1, drawn as a flat line.
no_skill = np.count_nonzero(y_test == 1) / len(y_test)

# Draw the baseline first, then the model's precision-recall curve.
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
pyplot.plot(lr_recall, lr_precision, marker='.', label='svm')

# Axis labels, legend, and display.
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
pyplot.legend()
pyplot.show()