## Importing required libraries

In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report

## Reading Dataset as a pandas data frame

In [2]:
# Load the two raw sources: IMDB reviews and a question corpus that supplies
# the third class.
imdb = pd.read_csv("E:/Momo/Datasets/imdbreviews/IMDB Dataset.csv")

# Maximum number of question rows to mix in with the reviews.
N_QUESTION_ROWS = 25000

# Build the question frame in one chain so no column is assigned on a
# head() slice (avoids SettingWithCopyWarning). A scalar label is broadcast
# to however many rows were actually read, so a file shorter than
# N_QUESTION_ROWS no longer raises a length-mismatch error.
ques = (
    pd.read_csv("E:/Momo/Datasets/questions/train.csv")
    .head(N_QUESTION_ROWS)
    .drop(columns=["context", "id", "title", "answers"])
    .assign(sentiment="question")
)
# Rename the remaining text column so both frames share the same schema.
ques.columns = ["review", "sentiment"]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and yields the same row order with a fresh 0..n-1 index.
dataset = pd.concat([imdb, ques], ignore_index=True)

# Optional integer encoding of the labels (kept disabled, as in the original):
# dataset["sentiment"] = dataset["sentiment"].replace("positive",1).replace("negative",0).replace("question", 2)


## Preprocessing dataset

Removing words and symbols that do not contain significant meaning.

- Removing HTML tags
- Removing URLs
- Removing special characters

In [5]:

def remove_html_tags(text):
    """Return the text content of `text` with any HTML markup stripped.

    The input is parsed with BeautifulSoup's "html.parser" backend and only
    the text nodes are kept.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_urls(text):
    """Delete every URL-like token from `text`.

    A token is removed when it starts with "http" and runs up to the next
    whitespace character, so both http:// and https:// links are covered.

    Args:
        text: The string to clean.

    Returns:
        `text` with each matching run replaced by the empty string.
    """
    # Raw string: "\S" in a plain string literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+), although it matched the same pattern.
    return re.sub(r"http\S+", "", text)

def remove_special_characters(text):
    """Drop every character that is not a word character, whitespace or '?'.

    Question marks are deliberately kept by the pattern; all other
    punctuation and symbols are removed.
    """
    unwanted = re.compile(r'[^\w\s\?]')
    return unwanted.sub('', text)

def clean_text(text):
    """Run `text` through the full cleaning pipeline and return the result.

    The steps are applied in a fixed order: HTML tags are stripped first,
    then URLs, then special characters.
    """
    pipeline = (remove_html_tags, remove_urls, remove_special_characters)
    for step in pipeline:
        text = step(text)
    return text

In [6]:
# Clean every review element-wise and write the result back to the same column.
dataset["review"] = dataset["review"].map(clean_text)



## Splitting dataset into Training and Testing data

In [8]:
# Inputs are the review texts; targets are their sentiment labels
X, y = dataset["review"], dataset["sentiment"]

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.4,
)

# Report the number of rows on each side of the split
print(X_train.shape, X_test.shape)

(45000,) (30000,)


## TF-IDF Vectorization

In [9]:
# Configure the TF-IDF vectorizer:
#   min_df=5          -> ignore terms appearing in fewer than 5 documents
#   max_df=0.8        -> ignore terms appearing in more than 80% of documents
#   sublinear_tf=True -> use 1 + log(tf) instead of raw term counts
vectorizer = TfidfVectorizer(
    use_idf=True,
    sublinear_tf=True,
    min_df=5,
    max_df=0.8,
)

# Fit on the training split only, then reuse that fitted vectorizer
# for the test split.
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

## Creating SVM model

In [10]:
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')

# Time the two phases with perf_counter(): it is monotonic and meant for
# measuring intervals, whereas time.time() can jump if the system clock is
# adjusted while this long-running cell executes.
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, y_train)
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()

time_linear_train = t1 - t0
time_linear_predict = t2 - t1

# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
# output_dict=True returns the metrics as a nested dict instead of a formatted string
report = classification_report(y_test, prediction_linear, output_dict=True)

Training time: 1244.436655s; Prediction time: 802.145438s


In [11]:
# Show the metrics dictionary returned by classification_report(..., output_dict=True)
print(report)

{'negative': {'precision': 0.9053930992613579, 'recall': 0.8834040872741633, 'f1-score': 0.8942634419348392, 'support': 10129}, 'positive': {'precision': 0.883325037332006, 'recall': 0.9035641547861507, 'f1-score': 0.893329977347093, 'support': 9820}, 'question': {'precision': 0.9979150119142176, 'recall': 1.0, 'f1-score': 0.9989564180291209, 'support': 10051}, 'accuracy': 0.9290666666666667, 'macro avg': {'precision': 0.9288777161691938, 'recall': 0.9289894140201046, 'f1-score': 0.928849945770351, 'support': 30000}, 'weighted avg': {'precision': 0.9291674117922798, 'recall': 0.9290666666666667, 'f1-score': 0.9290335246172379, 'support': 30000}}


In [12]:
# Classify one hand-written sentence with the fitted model.
classes = ["negative", "positive", "question"]
review = "this isnt accurate, i dont like it"

# Apply the same cleaning used on the training data before vectorizing;
# otherwise inputs with HTML, URLs or apostrophes are tokenized differently
# from what the model was trained on.
review_vector = vectorizer.transform([clean_text(review)]) # vectorizing
print(classifier_linear.predict(review_vector))

['negative']
