## 0. Import library

In [1]:
import pandas as pd 
import numpy as np 

## 1. Read data

In [2]:
df = pd.read_csv("../backend/data/reviews.csv")
df.head()

Unnamed: 0,id,movie_id,ratings,title,user_review,user_id_review,date_review,total_agree,is_spoilers,description
0,rw1007328,tt0004972,,The Conventional Wisdom is Partially Right,Snow Leopard,ur1174211,26 January 2005,275 out of 360,False,"The conventional wisdom about ""The Birth of a ..."
1,rw1375153,tt0004972,3.0,sad relic of racism,planktonrules,ur2467618,23 May 2006,98 out of 159,True,"First, I want to point out that just about eve..."
2,rw1139003,tt0004972,10.0,The Birth of an Art,Cineanalyst,ur1888886,31 July 2005,192 out of 321,True,"Before ""The Birth of a Nation,"" motion picture..."
3,rw3514927,tt0004972,8.0,Great techniques and a horrible message...,AlsExGal,ur15148330,30 July 2016,38 out of 60,False,...yet I still give it an 8/10 for all of the ...
4,rw1352391,tt0004972,,Is the historical importance of this film grea...,bbhlthph,ur2382545,26 April 2006,102 out of 161,False,"I saw this film at a small ""Art House"" theatre..."


In [3]:
df = df.iloc[:10000]

## 2. Explore data

In [4]:
# info data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              10000 non-null  object 
 1   movie_id        10000 non-null  object 
 2   ratings         8560 non-null   float64
 3   title           10000 non-null  object 
 4   user_review     10000 non-null  object 
 5   user_id_review  10000 non-null  object 
 6   date_review     10000 non-null  object 
 7   total_agree     10000 non-null  object 
 8   is_spoilers     10000 non-null  bool   
 9   description     10000 non-null  object 
dtypes: bool(1), float64(1), object(8)
memory usage: 713.0+ KB


Description data

|column|description| type|
|------|-----------|-----|
|id| id review| string|
|movie_id| id movie|string|
|ratings| rating of user for movie| float|
|movie_id| id movie|string|
|title| title of movie|string|
|user_review| name's user|string|
|user_id_review| id of user|string|
|date_review| date review|string|
|total_agree| ...|string|
|is_spoilers| ...|bool|
|description| description of review|string|

## 3. Processing data

In [5]:
def convert_total_agree_to_value(text:str):
    text = text.replace(',','')
    l = text.split(' ')
    if int(l[-1]) == 0:
        return 0
    return int(l[0]) / int(l[-1])
df['agree'] = df['total_agree'].apply(convert_total_agree_to_value)
## Transform type date_review column
df['date_review'] = pd.to_datetime(df['date_review'], format='%d %B %Y')
# drop duplicate
df.drop_duplicates()

Unnamed: 0,id,movie_id,ratings,title,user_review,user_id_review,date_review,total_agree,is_spoilers,description,agree
0,rw1007328,tt0004972,,The Conventional Wisdom is Partially Right,Snow Leopard,ur1174211,2005-01-26,275 out of 360,False,"The conventional wisdom about ""The Birth of a ...",0.763889
1,rw1375153,tt0004972,3.0,sad relic of racism,planktonrules,ur2467618,2006-05-23,98 out of 159,True,"First, I want to point out that just about eve...",0.616352
2,rw1139003,tt0004972,10.0,The Birth of an Art,Cineanalyst,ur1888886,2005-07-31,192 out of 321,True,"Before ""The Birth of a Nation,"" motion picture...",0.598131
3,rw3514927,tt0004972,8.0,Great techniques and a horrible message...,AlsExGal,ur15148330,2016-07-30,38 out of 60,False,...yet I still give it an 8/10 for all of the ...,0.633333
4,rw1352391,tt0004972,,Is the historical importance of this film grea...,bbhlthph,ur2382545,2006-04-26,102 out of 161,False,"I saw this film at a small ""Art House"" theatre...",0.633540
...,...,...,...,...,...,...,...,...,...,...,...
9995,rw6525538,tt0025316,10.0,A feel good romcom,funkybassgirl,ur23121544,2021-01-25,1 out of 1,False,I'm just starting to appreciate the world of c...,1.000000
9996,rw7702838,tt0025316,10.0,Masterpiece,theognis-80821,ur91212524,2021-12-30,0 out of 0,False,A beautifully structured script by Robert Risk...,0.000000
9997,rw2271768,tt0025316,8.0,"See It for Capra, Gable and Colbert",gelman@attglobal.net,ur1522352,2010-06-30,0 out of 0,False,"To modern sensibilities, ""It Happened One Nigh...",0.000000
9998,rw8330693,tt0025316,7.0,Very Nice Capra,thespeos,ur131560020,2022-07-13,0 out of 2,False,"I love director Frank Capra's work, but hadn't...",0.000000


In [6]:
## split train test
df_train = df[~df['ratings'].isna()]
df_test = df[df['ratings'].isna()]

In [7]:
## get label
def get_label(ratings:float):
    if ratings <= 4:
        return 'NEG'
    elif ratings >= 8:
        return 'POS'
    return 'NEU'
df_train['label'] = df_train['ratings'].apply(get_label)
## create column is_new_user
df_train['is_new_user'] = ~(df_train['user_id_review'].duplicated())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['label'] = df_train['ratings'].apply(get_label)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['is_new_user'] = ~(df_train['user_id_review'].duplicated())


In [8]:
df_train.to_csv("data/train.csv")
df_test.to_csv("data/test.csv")

In [10]:
import matplotlib.pyplot as plt

In [13]:
df.head()

Unnamed: 0,id,movie_id,ratings,title,user_review,user_id_review,date_review,total_agree,is_spoilers,description,agree
0,rw0002492,tt0012349,,A Must-See Silent Comedy,Snow Leopard,ur1174211,2002-03-04,87 out of 91,False,While perhaps not as celebrated now as some of...,0.956044
1,rw6111597,tt0012349,9.0,Emphatically Outstanding...,Xstal,ur4103165,2020-09-20,29 out of 30,False,It takes your breath away over 100 years later...,0.966667
2,rw1223756,tt0012349,9.0,One Of The Most Memorable Silent Films Ever,ccthemovieman-1,ur4445210,2005-11-24,70 out of 79,False,"Wow, is this a memorable film! It is one of th...",0.886076
3,rw1160671,tt0012349,9.0,Smiling and Tearing,Cineanalyst,ur1888886,2005-08-29,33 out of 35,True,"""The Kid"" is a powerfully emotional and wonder...",0.942857
4,rw1611019,tt0012349,,Chaplin understands how close slapstick is to ...,J. Spurlin,ur0679729,2007-03-04,47 out of 48,False,I've always thought there's a great beauty and...,0.979167


In [28]:
df['year'] = df['date_review'].dt.year
def chart_1(df):
    number_review_per_year = df.groupby('year')['id'].count()
    time_line = list(number_review_per_year.index.values)
    values = list(number_review_per_year.values)
    return [time_line, values]

def chart_2(df):
    df = df[df['user_id_review'].duplicated()]
    count_new_user = df.groupby('year')['user_id_review'].count()
    value = list(count_new_user.values)
    return value

def chart_3(df):
    df = df[~df['ratings'].isna()]
    avg_ratings_per_year = df.groupby('year')['ratings'].mean()
    return list(avg_ratings_per_year.values)

def get_number_user_film(df):
    number_user = df['user_id_review'].unique().shape[0]
    number_film = df['movie_id'].unique().shape[0]
    return [number_film, number_user]


In [33]:
df['user_id_review'].unique().shape[0]

202293