In [28]:
import pandas as pd

In [29]:
# read the entire file into a python array
with open("D:\\NLP\\Apps_for_Android_5.json", 'rU') as f:
   data = f.readlines()
#print(data)

# remove the trailing "\n" from each line
data = map(lambda x: x.rstrip(), data)

# each element of 'data' is an individual JSON object.
# i want to convert it into an *array* of JSON objects
# which, in and of itself, is one large JSON object
# basically... add square brackets to the beginning
# and end, and have all the individual business JSON objects
# separated by a comma
data_json_str = "[" + ",".join(data) + "]"

# now, load it into pandas
df = pd.read_json(data_json_str)

  


In [30]:
print(df.head())

         asin helpful  overall  \
0  B004A9SDD8  [1, 1]        3   
1  B004A9SDD8  [0, 0]        5   
2  B004A9SDD8  [0, 0]        5   
3  B004A9SDD8  [3, 4]        5   
4  B004A9SDD8  [1, 1]        5   

                                          reviewText   reviewTime  \
0  Loves the song, so he really couldn't wait to ...   11 2, 2013   
1  Oh, how my little grandson loves this app. He'...   12 5, 2011   
2  I found this at a perfect time since my daught...  05 21, 2012   
3  My 1 year old goes back to this game over and ...   12 6, 2012   
4  There are three different versions of the song...   02 1, 2014   

       reviewerID                   reviewerName  \
0  A1N4O8VOJZTDVB                 Annette Yancey   
1  A2HQWU6HUKIEC7        Audiobook lover "Kathy"   
2  A1SXASF6GYG96I                  Barbara Gibbs   
3  A2B54P9ZDYH167  Brooke Greenstreet "Babylove"   
4   AFOFZDTX5UC6D                     C. Galindo   

                                             summary  unixReviewTim

In [31]:
#dataframe size
df.shape

(752937, 9)

In [32]:
#drop duplicates if any

df.drop_duplicates(subset=["reviewerID","reviewTime"],keep='first')
#size
df.shape
#therefore there are no duplicates

(752937, 9)

In [33]:
#finding the null values if any
df.isnull().sum()
#reviewr name has some null values but that doesn't affect our model so we need not impute any values 

asin                  0
helpful               0
overall               0
reviewText            0
reviewTime            0
reviewerID            0
reviewerName      58198
summary               0
unixReviewTime        0
dtype: int64

In [34]:
#df['overall'].describe()
#dropping all the columns except reviewText and overall because the data relevant to sentiment analysis is present in this columns only
df_updated=df.drop(columns=["asin","helpful","reviewTime","reviewerID","reviewerName","summary","unixReviewTime"])
df_updated

Unnamed: 0,overall,reviewText
0,3,"Loves the song, so he really couldn't wait to ..."
1,5,"Oh, how my little grandson loves this app. He'..."
2,5,I found this at a perfect time since my daught...
3,5,My 1 year old goes back to this game over and ...
4,5,There are three different versions of the song...
5,5,THis is just so cute and a great app for littl...
6,5,I watch my great grandson 4 days a week and it...
7,5,This app is wild and crazy. Little ones love ...
8,5,love love love this app. I was going through d...
9,5,"Very cute, with alot of items to move about. ..."


In [35]:
#now our dataframe consists of only 2 columns which has review text and rating
#we have to convert the overall column to the values of 0 and 1 because sentiment of a text will be either positive or negative 
#Values greater than or equal to 3 are considered positive and less than 3 are negative

def f(x):
    if(x>=3):
        return 1
    else:
        return 0

df_updated["Rating"]=df_updated['overall'].apply(f)


In [36]:
df_updated.columns
final_df=df_updated.drop(columns=["overall"])
final_df

Unnamed: 0,reviewText,Rating
0,"Loves the song, so he really couldn't wait to ...",1
1,"Oh, how my little grandson loves this app. He'...",1
2,I found this at a perfect time since my daught...,1
3,My 1 year old goes back to this game over and ...,1
4,There are three different versions of the song...,1
5,THis is just so cute and a great app for littl...,1
6,I watch my great grandson 4 days a week and it...,1
7,This app is wild and crazy. Little ones love ...,1
8,love love love this app. I was going through d...,1
9,"Very cute, with alot of items to move about. ...",1


In [43]:
final_df["Rating"].value_counts()


1    629839
0    123098
Name: Rating, dtype: int64

In [60]:
print(len(final_df["reviewText"].values))

752937


# PreProcessing of reviews

In [46]:
#Removal of URL's if any
import re
sample_text='hello  https://www.google.com/'

sample_text=re.sub(r"http\S+", "",sample_text)
print(sample_text)

hello  


In [48]:
#Removal of html tags if any

from bs4 import BeautifulSoup
sample_Text='hello <br> </br> hi'

soup = BeautifulSoup(sample_Text, 'lxml')
text = soup.get_text()
print(text)

hello   hi


In [65]:
#Expanding English language contractions in Python and smiley
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    #phrase = re.sub(r":)", "happy", phrase)
    phrase=phrase.replace(":)","good")

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

print(decontracted("hi i am happy :) "))


hi i am happy good 


In [None]:
#Removing html characters
import html
print(html.unescape('&#34;'))


In [44]:
# Combining all the above stundents .
import html
from tqdm import tqdm
preprocessed_reviews = []
# tqdm is for printing the status bar
for sentance in tqdm(final_df['reviewText'].values):
    html.unescape(sentance)
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    # https://gist.github.com/sebleier/554280
    sentance = ' '.join(e.lower() for e in sentance.split() if e.lower() not in stopwords)
    preprocessed_reviews.append(sentance.strip())

In [67]:
import html
print(html.unescape('&#34;'))


"
