In [None]:
#1 importing + settings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Widen pandas' display limits so long review texts and wide frames are not
# truncated when a cell renders a DataFrame.
for option_name, option_value in (
    ("display.max_colwidth", 200),
    ("display.max_columns", 50),
):
    pd.set_option(option_name, option_value)

In [None]:
#2 loading data
# Use the working directory as the project root when it contains "data/";
# otherwise fall back to its parent directory.
working_dir = Path.cwd()
ROOT = working_dir if (working_dir / "data").exists() else working_dir.parent

PATH = ROOT.joinpath("data", "raw", "reviews.csv")

# Only the first 200_000 rows of the CSV are read.
dataframe = pd.read_csv(
    PATH,
    encoding="utf-8",
    nrows=200_000,
    low_memory=False,
)

dataframe.shape

In [None]:
#3 printing column labeling
# List the column names, then preview the first three rows.
column_names = dataframe.columns.tolist()
print("columns: ", column_names)
dataframe.head(3)

In [None]:
#4 explicit selection based on cell 3

#columns:  ['app_id', 'app_name', 'review_text', 'review_score', 'review_votes']
# Column names used by the rest of the notebook.
TEXT, SCORE = "review_text", "review_score"  # SCORE ∈ {-1, 1}

# Alias (not a copy) of the loaded frame under a stage-specific name.
raw_reviews_dataframe = dataframe
raw_reviews_dataframe.loc[:, [TEXT, SCORE]].head()

In [None]:
#5 double checking label
# Count every distinct score value. dropna=False keeps missing scores visible
# as their own row; by default value_counts() hides them, which would make
# this sanity check look clean even though those rows are dropped later.
raw_reviews_dataframe[SCORE].value_counts(dropna=False).sort_index()

In [None]:
#6 cleaning dataframe
# Work on a copy so the raw frame stays untouched and this cell can be
# re-run without compounding its own changes.
reviews_dataframe = raw_reviews_dataframe.copy()

# Record which reviews actually have text *before* casting to str:
# astype(str) turns a missing value into the literal string "nan", which
# would otherwise pass the non-empty check below and end up in the output.
has_text = reviews_dataframe[TEXT].notna()

reviews_dataframe["text"] = reviews_dataframe[TEXT].astype(str).str.strip()
# Map the {-1, 1} score onto a {0, 1} label; any other score becomes NaN.
reviews_dataframe["label"] = reviews_dataframe[SCORE].map({-1: 0, 1: 1})

# Keep rows that had real text, are non-empty after stripping, and carry a
# recognised label.
reviews_dataframe = reviews_dataframe[
    has_text &
    (reviews_dataframe["text"].str.len() > 0) &
    (reviews_dataframe["label"].notna())
].copy()

# All remaining labels are 0 or 1, so store them as integers; unmapped
# scores would otherwise have left the column as float (0.0 / 1.0).
reviews_dataframe["label"] = reviews_dataframe["label"].astype(int)

In [None]:
#7 share of positive vs negative labels
# normalize=True reports each label as a fraction of all rows, not a count.
label_column = reviews_dataframe["label"]
label_column.value_counts(normalize=True)

In [None]:
#8 review text information

# Character count of each cleaned review, stored as a new "text_len" column.
text_lengths = reviews_dataframe["text"].str.len()
reviews_dataframe["text_len"] = text_lengths

# Summary statistics, with extra upper percentiles to expose the long tail.
length_percentiles = [0.5, 0.75, 0.9, 0.95, 0.99]
reviews_dataframe["text_len"].describe(percentiles=length_percentiles)

In [None]:
#9 graphing test
# Cap lengths at the 99th percentile so a few very long reviews do not
# stretch the x-axis, then draw a 50-bin histogram.
length_cap = reviews_dataframe["text_len"].quantile(0.99)
capped_lengths = reviews_dataframe["text_len"].clip(upper=length_cap)
ax = capped_lengths.hist(bins=50)

# Label the axes that hist() drew on.
ax.set_title("review length")
ax.set_xlabel("char")
ax.set_ylabel("count")
plt.show()

In [None]:
#10 random sample (testing for later)

# Fixed seed so the same rows are shown on every Restart & Run All, and the
# sample size is capped at the frame length so a small frame does not raise.
SAMPLE_SEED = 42
sample_size = min(5, len(reviews_dataframe))

reviews_dataframe.sample(n=sample_size, random_state=SAMPLE_SEED)[
    ["app_name", "text", "label", "review_votes"]
]

In [None]:
#11 edge case inspection
# Show the ten shortest reviews (sorted ascending by character count).
edge_case_columns = ["app_name", "text", "label", "text_len"]
reviews_dataframe[edge_case_columns].sort_values("text_len").head(10)

In [None]:
#12 saving cleaned data
# Build the output directory from ROOT, the same project root the raw CSV was
# loaded from. A hard-coded "../data/processed" points outside the project
# whenever the notebook is run from the project root itself.
PROCESSED_DIR = ROOT / "data" / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Write only the columns needed downstream; index=False omits the row index.
reviews_dataframe[
    ["app_id", "app_name", "text", "label", "review_votes"]
].to_csv(PROCESSED_DIR / "reviews_clean.csv", index=False)