# Yelp Review Data
Load, parse, and filter the review JSON data from the Yelp Dataset Challenge (Round 13) to build a training set for sentiment analysis.

### Load Dataset from file

In [15]:
def LoadYelpData(
    FilePath = "Z:\\Downloads\\yelp_dataset\\yelp_dataset~\\review.json",
    ReadWriteChunkSize = 100000,
    MaxDataRows = 1000000,
):
    """Load the star rating and text of Yelp reviews from a JSON-lines file.

    The file is read in chunks so that only the two needed columns of each
    chunk are kept in memory, and reading stops once MaxDataRows rows have
    been collected. A progress line (rows read so far and elapsed time) is
    printed after every chunk.

    Parameters:
        FilePath: Path of the JSON-lines review file (one JSON object per line).
        ReadWriteChunkSize: Number of lines parsed per chunk.
        MaxDataRows: Upper bound on the number of rows returned (at least 1).

    Returns:
        A DataFrame with the columns "stars" and "text". It is empty when the
        file contains no rows.

    Raises:
        ValueError: If MaxDataRows is smaller than 1.
    """
    from datetime import datetime
    import pandas as pd

    if MaxDataRows < 1:
        raise ValueError("MaxDataRows must be at least 1, got " + str(MaxDataRows))

    DataReader = pd.read_json(
        FilePath,
        lines = True,
        chunksize = ReadWriteChunkSize,
        dtype = {
            'review_id': str,
            'user_id': str,
            'business_id': str,
            'stars': "int8",
            'date': str,
            'text': str,
            'useful': "int8",
            'funny': "int8",
            'cool': "int8",
        },
    )

    fileData = []
    rowsRead = 0
    startTime = datetime.now()
    try:
        for data in DataReader:
            # Keep only the needed columns, and trim the chunk that crosses the
            # cap so the result never exceeds MaxDataRows.
            data = data[["stars", "text"]].iloc[:MaxDataRows - rowsRead]
            fileData.append(data)
            # Count actual rows: the last chunk of a file is usually partial.
            rowsRead += len(data)
            print(str(rowsRead), " - ", str(datetime.now() - startTime))
            if rowsRead >= MaxDataRows:
                break
    finally:
        # Breaking out early leaves the reader un-exhausted, so close it
        # explicitly to release the underlying file handle.
        DataReader.close()

    print("\n\nFile Reading Complete")
    if not fileData:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns = ["stars", "text"])
    return pd.concat(fileData, axis = 0)

In [16]:
# Load the review subset; LoadYelpData prints one progress line per chunk read.
FileData = LoadYelpData()

100000  -  0:00:02.838064
200000  -  0:00:05.766999
300000  -  0:00:08.664844
400000  -  0:00:12.269671
500000  -  0:00:15.837154
600000  -  0:00:19.450968
700000  -  0:00:23.055793
800000  -  0:00:26.613450
900000  -  0:00:30.176524
1000000  -  0:00:33.744363


File Reading Complete


In [17]:
# Give the two retained columns the descriptive names used by the filtering cells below.
FileData = FileData.rename(columns = {"stars":"StarRating", "text":"ReviewText"})

In [18]:
# Summarize the row count, column dtypes, and memory usage of the loaded frame.
FileData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
StarRating    1000000 non-null int8
ReviewText    1000000 non-null object
dtypes: int8(1), object(1)
memory usage: 8.6+ MB


### Filter by star rating

In [19]:
# Downsample star ratings 2-5 to at most the 1-star row count (binary mode removes ratings 2-4 entirely)
def DropStars(FileData, Binary = False):
    """Downsample the higher star ratings relative to the 1-star row count.

    The number of rows with a StarRating of 1 or less is the baseline.
    Ratings 2 through 5 are then reduced, one rating at a time, by
    DropRowsAtRandom:

    * Binary false: each of the ratings 2-5 keeps at most the baseline count.
    * Binary true: ratings 2-4 are removed completely and rating 5 keeps at
      most the baseline count.

    Ratings that already have no more rows than their target are untouched.

    Parameters:
        FileData: DataFrame with a "StarRating" column.
        Binary: Whether to keep only the two extreme ratings.

    Returns:
        The DataFrame with the surplus rows dropped.
    """
    baselineCount = FileData.loc[FileData["StarRating"] <= 1, "StarRating"].count()

    # Ratings 2-4 disappear in binary mode; otherwise they share the baseline.
    middleTarget = 0 if Binary else baselineCount
    targetCounts = {
        1 : baselineCount,
        2 : middleTarget,
        3 : middleTarget,
        4 : middleTarget,
        5 : baselineCount,
    }

    # Rating 1 is the reference class, so only ratings 2-5 are reduced.
    for rating in (2, 3, 4, 5):
        FileData = DropRowsAtRandom(
            DataFrame = FileData,
            CategoryColumnName = "StarRating",
            ValueCountToReduce = rating,
            RemainingRows = targetCounts[rating],
        )

    return FileData

In [20]:
# Randomly drop rows of one category value until only the requested number of them remain
def DropRowsAtRandom(DataFrame, CategoryColumnName, RemainingRows, ValueCountToReduce = 0):
    """Randomly drop rows of one category until only RemainingRows of them remain.

    Rows whose CategoryColumnName equals ValueCountToReduce are candidates for
    removal; all other rows are kept. The selection uses a private random
    generator with a fixed seed, so the same input always loses the same rows
    and the process-wide numpy random state is left untouched.

    Parameters:
        DataFrame: Frame to reduce. It is not modified in place.
        CategoryColumnName: Name of the column holding the category values.
        RemainingRows: Number of matching rows to keep (non-negative).
        ValueCountToReduce: Category value whose rows are reduced.

    Returns:
        The input frame itself when it already has no more than RemainingRows
        matching rows, otherwise a new frame with the surplus rows dropped.

    Raises:
        ValueError: If RemainingRows is negative.
    """
    import numpy as np

    if RemainingRows < 0:
        raise ValueError("RemainingRows must be non-negative, got " + str(RemainingRows))

    columnIndexArray = DataFrame[(DataFrame[CategoryColumnName] == ValueCountToReduce)].index
    startingRowCount = len(columnIndexArray)

    if startingRowCount <= RemainingRows:
        return DataFrame

    # A local RandomState seeded with 13 draws the same sample as seeding the
    # global generator would, without resetting the RNG for the rest of the session.
    randomState = np.random.RandomState(13)
    rowsToDrop = randomState.choice(
        columnIndexArray,
        (startingRowCount - RemainingRows),
        replace = False,
    )
    return DataFrame.drop(rowsToDrop)

In [21]:
# Show how many reviews there are for each star rating.
FileData["StarRating"].value_counts()

5    441856
4    219363
1    148519
3    109804
2     80458
Name: StarRating, dtype: int64

In [22]:
# Downsample ratings 2-5 to at most the 1-star count; Binary = False keeps all five ratings.
FileData = DropStars(FileData, Binary = False)

In [23]:
# Re-check the per-rating counts after downsampling.
FileData["StarRating"].value_counts()

5    148519
4    148519
1    148519
3    109804
2     80458
Name: StarRating, dtype: int64

In [24]:
# Summarize the filtered frame; dropped rows leave gaps in the index labels.
FileData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 635819 entries, 0 to 999998
Data columns (total 2 columns):
StarRating    635819 non-null int8
ReviewText    635819 non-null object
dtypes: int8(1), object(1)
memory usage: 10.3+ MB


### Save Dataset to File

In [25]:
# Path of the CSV file the filtered dataset is written to and reloaded from.
FileSavePath = "YelpReviewData.csv"

In [26]:
# Write the filtered reviews to CSV; index = False leaves the row labels out of the file.
FileData.to_csv(FileSavePath, index = False)

### Reload Dataset from Save File

In [27]:
import pandas as pd
# keep_default_na = False stops read_csv from turning review text such as
# "NA", "null", "None" or an empty string into float NaN, so ReviewText
# comes back as the same strings that were saved.
FileDataReloaded = pd.read_csv(
    FileSavePath,
    dtype = {"StarRating": "int8", "ReviewText":"str"},
    keep_default_na = False,
)

In [28]:
# Summarize the reloaded frame to compare its row count and dtypes with the frame that was saved.
FileDataReloaded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 635819 entries, 0 to 635818
Data columns (total 2 columns):
StarRating    635819 non-null int8
ReviewText    635819 non-null object
dtypes: int8(1), object(1)
memory usage: 5.5+ MB
