# ReelFeel DataPreprocessing 

In [9]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import pymongo
import spacy
from pymongo import MongoClient
tqdm.pandas()
import string
import re
import nltk
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package wordnet to C:\Users\Nitro
[nltk_data]     5\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
lemmatizer = WordNetLemmatizer()

!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 640.0 kB/s eta 0:00:20
     --------------------------------------- 0.0/12.8 MB 487.6 kB/s eta 0:00:27
     --------------------------------------- 0.1/12.8 MB 655.4 kB/s eta 0:00:20
     --------------------------------------- 0.1/12.8 MB 708.1 kB/s eta 0:00:18
      --------------------------------------- 0.3/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.5/12.8 MB 1.8 MB/s eta 0:00:07
     - -------------------------------------- 0.6/12.8 MB 1.9 MB/s eta 0:00:07
     --- ------------------------------------ 1.0/12.8 MB 2.8 MB/s eta 0:00:05
     ---- ----------------------------------- 1.4/12.8 MB 3.2 MB/s eta 0:00:04
     ---- ---------------------------


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Pre-processing Function

In [10]:
def text_preprocessing(review):
  
  #changing into lower case

  review_text = review.lower()

  # Removing HTML tags

  review_text = BeautifulSoup(review_text , 'html.parser').get_text()

  # Removing Punctuations

  review_text = ''.join(char for char in review_text if char not in string.punctuation)

  # Removing Whitespaces

  review_text = re.sub(r'\s+' , ' ' , review_text)
  
  # Performing Lemmatization i.e converting words into base words
  nlp = spacy.load('en_core_web_sm')
  doc = nlp(review_text)
  lemmatized_tokens = [token.lemma_ for token in doc]
  
  
  # Removing Stopwords i.e words that add little to no meaning to the review

  review_clean_text = [word for word in lemmatized_tokens if word not in stopwords.words('english') ]

  clean_sentence = ' '.join(char for char in review_clean_text)

  return clean_sentence

### Understanding the Doc Object
The Doc object represents the entire input text, and it provides a way to access detailed linguistic information. Here’s a more detailed breakdown:

- Tokens: You can iterate over the Doc object to get individual tokens.
- Sentences: You can access sentences if the text contains multiple sentences.
- Entities: You can access named entities in the text.
- Linguistic Annotations: Each token in the Doc object has annotations like part-of-speech tags, dependencies, and lemmas.

In [11]:
test_text = 'Hi, there i am Lionel Messi, and i wear the jersey with the number 10 on it.    Tommorrow i will be playing against Chelsea and i hope we play well!!'

In [12]:
text_preprocessing(test_text)

'hi I lionel messi I wear jersey number 10 tommorrow I play chelsea I hope play well'

In [13]:
connection = MongoClient('localhost' , 27017)
db = connection['mydb']
reelfeel_connection = db['Sentiment_Data']

In [14]:
cursor = reelfeel_connection.find({})

In [15]:
reelfeel_df = pd.DataFrame(list(cursor))

In [16]:
reelfeel_df.head()

Unnamed: 0,_id,review,word count,positive
0,66422ae96d5c91bd1d010578,One of the other reviewers has mentioned that ...,307,1
1,66422ae96d5c91bd1d010579,A wonderful little production. <br /><br />The...,162,1
2,66422ae96d5c91bd1d01057a,I thought this was a wonderful way to spend ti...,166,1
3,66422ae96d5c91bd1d01057b,Basically there's a family where a little boy ...,138,0
4,66422ae96d5c91bd1d01057c,"Petter Mattei's ""Love in the Time of Money"" is...",230,1


In [17]:
reelfeel_df['review'] = reelfeel_df['review'].progress_apply(text_preprocessing)

  review_text = BeautifulSoup(review_text , 'html.parser').get_text()
100%|██████████| 50000/50000 [6:24:45<00:00,  2.17it/s]  


In [18]:
reelfeel_df.head()

Unnamed: 0,_id,review,word count,positive
0,66422ae96d5c91bd1d010578,one reviewer mention watch 1 oz episode hook r...,307,1
1,66422ae96d5c91bd1d010579,wonderful little production film technique una...,162,1
2,66422ae96d5c91bd1d01057a,I think wonderful way spend time hot summer we...,166,1
3,66422ae96d5c91bd1d01057b,basically family little boy jake think zombie ...,138,0
4,66422ae96d5c91bd1d01057c,petter matteis love time money visually stunni...,230,1


In [19]:
reelfeel_df['review'][0]

'one reviewer mention watch 1 oz episode hook right exactly happen methe first thing strike I oz brutality unflinche scene violence set right word go trust I show faint hearted timid show pull punch regard drug sex violence hardcore classic use wordit call oz nickname give oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inward privacy high agenda em city home manyaryan muslims gangstas latinos christians italian irish moreso scuffle death stare dodgy dealing shady agreement never far awayi would say main appeal show due fact go show would dare forget pretty picture paint mainstream audience forget charm forget romanceoz mess around first episode I ever see strike I nasty surreal I could say I ready I watch I develop taste oz get accustomed high level graphic violence violence injustice crook guard sell nickel inmate kill order get away well mannered middle class inmate turn prison bitch due lack street skill prison e

In [20]:
reelfeel_df.drop('_id' , axis = 1 , inplace= True)

In [21]:
reelfeel_df.head()

Unnamed: 0,review,word count,positive
0,one reviewer mention watch 1 oz episode hook r...,307,1
1,wonderful little production film technique una...,162,1
2,I think wonderful way spend time hot summer we...,166,1
3,basically family little boy jake think zombie ...,138,0
4,petter matteis love time money visually stunni...,230,1


In [22]:
reelfeel_df.to_csv('Preprocessed_Lemmatized_Data.csv' , index= False)

In [None]:
# def text_preprocessing(review):
  
#   #changing into lower case

#   review_text = review.lower()

#   # Removing HTML tags

#   review_text = BeautifulSoup(review_text , 'html.parser').get_text()

#   # Removing Punctuations

#   review_text = ''.join(char for char in review_text if char not in string.punctuation)

#   # Removing Whitespaces

#   review_text = re.sub(r'\s+' , ' ' , review_text)

#   # Performing Stemming i.e converting words into base words
#   stemmer = PorterStemmer()
#   review_text = [stemmer.stem(word) for word in review_text.split()]
  
#   # Removing Stopwords i.e words that add little to no meaning to the review

#   review_clean_text = [word for word in review_text if word not in stopwords.words('english') ]

#   clean_sentence = ' '.join(char for char in review_clean_text)

#   return clean_sentence