# Yelp Review Data
Loads, parses, and filters the review JSON file from Round 13 of the Yelp Dataset Challenge to build a training set for sentiment analysis.

### Load Dataset from file

In [1]:
def LoadYelpData(FilePath = "A:\\Files\\Shares\\Downloads\\yelp_dataset\\yelp_dataset~\\review.json",
                 ReadWriteChunkSize = 10000,
                 MaxDataRows = 50000):
    """Read the star rating and text of reviews from a JSON-lines review file.

    The file is read in chunks so that only the two needed columns of each
    chunk are kept in memory, and reading stops once MaxDataRows rows have
    been collected.

    Args:
        FilePath: Path of the JSON-lines file (one review object per line).
        ReadWriteChunkSize: Number of lines parsed per chunk.
        MaxDataRows: Upper bound on the number of rows returned.

    Returns:
        A DataFrame with the columns "stars" and "text" holding at most
        MaxDataRows rows. It is empty when no rows were read.
    """
    from datetime import datetime
    import pandas as pd

    DataReader = pd.read_json(FilePath, lines = True, chunksize = ReadWriteChunkSize,
                              dtype={'review_id':str,
                                     'user_id':str,
                                     'business_id':str,
                                     'stars':"int8",
                                     'date':str,
                                     'text':str,
                                     'useful':"int8",
                                     'funny':"int8",
                                     'cool':"int8"
                                    },
                              )

    fileData = []
    rowsRead = 0
    startTime = datetime.now()
    try:
        for data in DataReader:
            rowsWanted = MaxDataRows - rowsRead
            if(rowsWanted <= 0):
                break
            # Trim the chunk so the total never exceeds MaxDataRows, even when
            # the limit is not a multiple of the chunk size.
            data = data[["stars","text"]].iloc[:rowsWanted]
            fileData.append(data)
            # Count the rows actually read; the last chunk of a file is
            # usually shorter than ReadWriteChunkSize.
            rowsRead += len(data)
            print(str(rowsRead), " - ", str(datetime.now() - startTime))
            if(rowsRead >= MaxDataRows):
                break
    finally:
        # Stopping early leaves the reader un-exhausted, so release the file
        # handle explicitly.
        DataReader.close()

    print("\n\nFile Reading Complete")
    if(not fileData):
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns = ["stars", "text"])
    return pd.concat(fileData, axis = 0)

In [2]:
# Load the reviews using the settings built into LoadYelpData
# (10,000-row chunks, stopping at 50,000 rows).
FileData = LoadYelpData()

10000  -  0:00:00.181888
20000  -  0:00:00.344473
30000  -  0:00:00.501120
40000  -  0:00:00.652570
50000  -  0:00:00.812928


File Reading Complete


In [3]:
# Rename the raw "stars"/"text" fields to the column names the filtering and
# save/reload cells below refer to.
FileData = FileData.rename(columns = {"stars":"StarRating", "text":"ReviewText"})

In [4]:
# Show the row count, column dtypes and memory use of the loaded data.
FileData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
StarRating    50000 non-null int8
ReviewText    50000 non-null object
dtypes: int8(1), object(1)
memory usage: 439.6+ KB


### Filter by star rating

In [5]:
# Down-sample the 2- to 5-star reviews relative to the number of 1-star reviews
def DropStars(FileData, Binary = False):
    """Cap the row count of each star rating from 2 to 5.

    The cap is the number of rows rated 1 star or lower. With Binary set,
    only the 5-star rows get that cap and every 2-, 3- and 4-star row is
    removed; otherwise all four ratings get the same cap. Ratings that
    already have no more rows than the cap are left untouched, and the
    1-star rows are never dropped.

    Args:
        FileData: DataFrame with a "StarRating" column.
        Binary: When truthy, keep only the 1-star and 5-star classes.

    Returns:
        The DataFrame produced by applying DropRowsAtRandom to each rating.
    """
    oneStarMask = FileData["StarRating"] <= 1
    oneStarTotal = FileData.loc[oneStarMask, "StarRating"].count()

    # Ratings whose rows are capped at the one-star total; all others are
    # removed entirely (target of zero rows).
    cappedRatings = (5,) if Binary else (5, 4, 3, 2)

    for rating in (2, 3, 4, 5):
        rowsToKeep = oneStarTotal if rating in cappedRatings else 0
        FileData = DropRowsAtRandom(
            DataFrame = FileData,
            CategoryColumnName = "StarRating",
            ValueCountToReduce = rating,
            RemainingRows = rowsToKeep,
        )
    return FileData

In [6]:
# Randomly drop rows of one category until at most RemainingRows of them are left
def DropRowsAtRandom(DataFrame, CategoryColumnName, RemainingRows, ValueCountToReduce = 0, Seed = 13):
    """Randomly remove rows of a single category down to a target count.

    Args:
        DataFrame: The frame to filter; it is not modified.
        CategoryColumnName: Column holding the category values.
        RemainingRows: Number of rows of the category to keep. Negative
            values are treated as 0.
        ValueCountToReduce: The category value whose rows are reduced
            (despite the name, this is a value, not a count).
        Seed: Seed of the private random generator, so the same rows are
            dropped on every run.

    Returns:
        The original DataFrame object when the category already has no more
        than RemainingRows rows, otherwise a new DataFrame without the
        dropped rows. Row order and index labels are preserved.
    """
    import numpy as np

    # Work with row positions rather than index labels: dropping by label
    # would remove every row sharing a duplicated label.
    categoryMask = (DataFrame[CategoryColumnName] == ValueCountToReduce).values
    rowPositions = np.flatnonzero(categoryMask)
    startingRowCount = len(rowPositions)
    rowsToKeep = max(int(RemainingRows), 0)

    if(startingRowCount <= rowsToKeep):
        return DataFrame

    # A private RandomState draws the same sample as seeding the global
    # generator would, without resetting the random state of other code.
    randomState = np.random.RandomState(Seed)
    positionsToDrop = randomState.choice(
        rowPositions,
        (startingRowCount - rowsToKeep),
        replace = False,
    )

    keepMask = np.ones(len(DataFrame), dtype = bool)
    keepMask[positionsToDrop] = False
    return DataFrame[keepMask]

In [7]:
# Rows per star rating before any rows are dropped.
FileData["StarRating"].value_counts()

5    22271
4    11062
1     7245
3     5467
2     3955
Name: StarRating, dtype: int64

In [8]:
# Binary = False caps the 2- to 5-star classes at the 1-star row count;
# classes that are already smaller than that are left as they are.
FileData = DropStars(FileData, Binary = False)

In [9]:
# Rows per star rating after DropStars has removed the excess rows.
FileData["StarRating"].value_counts()

5    7245
4    7245
1    7245
3    5467
2    3955
Name: StarRating, dtype: int64

In [10]:
# Show the row count and dtypes after filtering. The surviving rows keep
# their original index labels; they are not renumbered.
FileData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31157 entries, 0 to 49999
Data columns (total 2 columns):
StarRating    31157 non-null int8
ReviewText    31157 non-null object
dtypes: int8(1), object(1)
memory usage: 517.3+ KB


### Save Dataset to File

In [11]:
# Relative path, so the CSV is written to the current working directory.
FileSavePath = "YelpReviewData.csv"

In [12]:
# index = False leaves the DataFrame index out of the file, so only the
# StarRating and ReviewText columns are written.
FileData.to_csv(FileSavePath, index = False)

### Reload Dataset from Save File

In [13]:
import pandas as pd
# Read the saved CSV back, requesting int8 for the ratings again rather than
# letting read_csv infer the integer type.
FileDataReloaded = pd.read_csv(FileSavePath, dtype = {"StarRating": "int8", "ReviewText":"str"})

In [14]:
# Show the row count and dtypes of the reloaded data for comparison with the
# frame that was saved.
FileDataReloaded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31157 entries, 0 to 31156
Data columns (total 2 columns):
StarRating    31157 non-null int8
ReviewText    31157 non-null object
dtypes: int8(1), object(1)
memory usage: 274.0+ KB
