<a href="https://colab.research.google.com/github/Schota/shares/blob/master/NLP_Bag_of_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Labelled reviews, tab-separated. quoting=3 is csv.QUOTE_NONE: quote characters
# are treated as ordinary text, so the ids and reviews keep their literal quotes.
trainset = pd.read_csv('train.tsv', sep='\t', quoting=3)
trainset

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."
...,...,...,...
24995,"""3453_3""",0,"""It seems like more consideration has gone int..."
24996,"""5064_1""",0,"""I don't believe they made this film. Complete..."
24997,"""10905_3""",0,"""Guy is a loser. Can't get girls, needs to bui..."
24998,"""10194_3""",0,"""This 30 minute documentary Buñuel made in the..."


## Cleaning the texts

In [None]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from bs4 import BeautifulSoup

def _english_stopwords():
  """Return the NLTK English stop-word set, building it only on first use."""
  cached = getattr(_english_stopwords, '_cache', None)
  if cached is None:
    cached = frozenset(stopwords.words('english'))
    _english_stopwords._cache = cached
  return cached


def review_to_words( raw_review):
  """Turn one raw review into a cleaned, space-separated string of words.

  Steps, in order:
    1. strip HTML markup,
    2. replace every non-letter character with a space,
    3. lower-case and split on whitespace,
    4. drop English stop words,
    5. lemmatize each remaining word as a verb.

  Parameters
  ----------
  raw_review : str
      The review text as loaded from the TSV file (may contain HTML tags,
      literal quote characters and punctuation).

  Returns
  -------
  str
      The cleaned words joined by single spaces; empty string if nothing
      survives the filtering.
  """
  # Name the parser explicitly so the result does not depend on which optional
  # parsers happen to be installed (and to avoid bs4's parser-guess warning).
  review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
  letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
  # Tokenise the letters-only text; splitting review_text instead would leave
  # punctuation and digits attached to the words.
  words = letters_only.lower().split()
  stops = _english_stopwords()
  # Build a new list: rebinding the loop variable in a for-loop would discard
  # the lemmatized forms.
  meaningful_words = [wordnet_lemmatizer.lemmatize(w, 'v')
                      for w in words if w not in stops]
  return " ".join(meaningful_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Clean every training review in row order, so the cleaned texts stay aligned
# with the rows of trainset.
num_reviews = trainset['review'].size
clean_train_reviews = [review_to_words(raw_text) for raw_text in trainset["review"]]

## Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Word-level bag of words capped at 10,000 vocabulary entries. Every other
# option is left at its CountVectorizer default (word analyzer, built-in
# tokenizer and preprocessor, no extra stop-word filtering).
cv = CountVectorizer(max_features=10000)
# Learn the vocabulary from the training reviews and convert the resulting
# sparse count matrix to a dense array.
train_data_features = cv.fit_transform(clean_train_reviews).toarray()

In [None]:
# Sanity check: one row per review, one column per vocabulary term.
train_data_features.shape

(25000, 10000)

## Training the classification model on the Training set

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the labelled reviews for evaluation. Both the split and the
# forest are seeded so the metrics reported below are reproducible on a
# fresh Restart & Run All; without a seed they change on every run.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_features, trainset["sentiment"], test_size=0.2, random_state=42)

classifier = RandomForestClassifier(n_estimators = 100, random_state=42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Predict sentiment for the held-out 20% split.
predictions = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix,classification_report
# Rows are true labels, columns are predicted labels (scikit-learn convention).
print(confusion_matrix(y_test,predictions))

[[2161  360]
 [ 400 2079]]


In [None]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      2521
           1       0.85      0.84      0.85      2479

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



## Predicting the Test set results

In [None]:
# Unlabelled reviews to score, read with the same settings as the training
# file: tab-separated, and quoting=3 (csv.QUOTE_NONE) keeps quote characters
# as literal text.
testset = pd.read_csv('test.tsv', sep='\t', quoting=3)
testset

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."
...,...,...
24995,"""2155_10""","""Sony Pictures Classics, I'm looking at you! S..."
24996,"""59_10""","""I always felt that Ms. Merkerson had never go..."
24997,"""2531_1""","""I was so disappointed in this movie. I am ver..."
24998,"""7772_8""","""From the opening sequence, filled with black ..."


In [None]:
# Apply the same cleaning function used for the training reviews, in row order,
# so the cleaned texts stay aligned with testset["id"].
num_reviews = len(testset["review"])
clean_test_reviews = [review_to_words(raw_text) for raw_text in testset["review"]]

In [None]:
# transform (not fit_transform): reuse the vocabulary already fitted on the
# training reviews so the test columns line up with what the classifier saw.
test_data_features = cv.transform(clean_test_reviews).toarray()

In [None]:
# Inspect the dense test-set count matrix.
test_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 2, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Predict a sentiment label for every test review and pair it with the
# review's id.
result = classifier.predict(test_data_features)
output = pd.DataFrame({"id": testset["id"], "sentiment": result})
# quoting=3 (csv.QUOTE_NONE): the ids were loaded with their literal quote
# characters, so they are written back verbatim instead of being quoted again.
output.to_csv("results.csv", index=False, quoting=3)