## 0. Import library

In [2]:
import pandas as pd 
import numpy as np 

## 1. Read data

In [3]:
df = pd.read_json("Data/final.json")
df.head()

Unnamed: 0,id,movie_id,ratings,title,user_review,user_id_review,date_review,total_agree,is_spoilers,description
0,rw0002492,tt0012349,,A Must-See Silent Comedy,Snow Leopard,ur1174211,4 March 2002,87 out of 91,False,While perhaps not as celebrated now as some of...
1,rw6111597,tt0012349,9.0,Emphatically Outstanding...,Xstal,ur4103165,20 September 2020,29 out of 30,False,It takes your breath away over 100 years later...
2,rw1223756,tt0012349,9.0,One Of The Most Memorable Silent Films Ever,ccthemovieman-1,ur4445210,24 November 2005,70 out of 79,False,"Wow, is this a memorable film! It is one of th..."
3,rw1160671,tt0012349,9.0,Smiling and Tearing,Cineanalyst,ur1888886,29 August 2005,33 out of 35,True,"""The Kid"" is a powerfully emotional and wonder..."
4,rw1611019,tt0012349,,Chaplin understands how close slapstick is to ...,J. Spurlin,ur0679729,4 March 2007,47 out of 48,False,I've always thought there's a great beauty and...


## 2. Explore data

In [4]:
# info data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349280 entries, 0 to 349279
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              349280 non-null  object 
 1   movie_id        349280 non-null  object 
 2   ratings         319197 non-null  float64
 3   title           349280 non-null  object 
 4   user_review     349280 non-null  object 
 5   user_id_review  349280 non-null  object 
 6   date_review     349280 non-null  object 
 7   total_agree     349280 non-null  object 
 8   is_spoilers     349280 non-null  bool   
 9   description     349280 non-null  object 
dtypes: bool(1), float64(1), object(8)
memory usage: 24.3+ MB


Description data

|column|description| type|
|------|-----------|-----|
|id| id review| string|
|movie_id| id movie|string|
|ratings| rating of user for movie| float|
|movie_id| id movie|string|
|title| title of movie|string|
|user_review| name's user|string|
|user_id_review| id of user|string|
|date_review| date review|string|
|total_agree| ...|string|
|is_spoilers| ...|bool|
|description| description of review|string|

## 3. Processing data

In [5]:
def convert_total_agree_to_value(text:str):
    text = text.replace(',','')
    l = text.split(' ')
    if int(l[-1]) == 0:
        return 0
    return int(l[0]) / int(l[-1])
df['agree'] = df['total_agree'].apply(convert_total_agree_to_value)
## Transform type date_review column
df['date_review'] = pd.to_datetime(df['date_review'], format='%d %B %Y')
# drop duplicate
df.drop_duplicates()

Unnamed: 0,id,movie_id,ratings,title,user_review,user_id_review,date_review,total_agree,is_spoilers,description,agree
0,rw0002492,tt0012349,,A Must-See Silent Comedy,Snow Leopard,ur1174211,2002-03-04,87 out of 91,False,While perhaps not as celebrated now as some of...,0.956044
1,rw6111597,tt0012349,9.0,Emphatically Outstanding...,Xstal,ur4103165,2020-09-20,29 out of 30,False,It takes your breath away over 100 years later...,0.966667
2,rw1223756,tt0012349,9.0,One Of The Most Memorable Silent Films Ever,ccthemovieman-1,ur4445210,2005-11-24,70 out of 79,False,"Wow, is this a memorable film! It is one of th...",0.886076
3,rw1160671,tt0012349,9.0,Smiling and Tearing,Cineanalyst,ur1888886,2005-08-29,33 out of 35,True,"""The Kid"" is a powerfully emotional and wonder...",0.942857
4,rw1611019,tt0012349,,Chaplin understands how close slapstick is to ...,J. Spurlin,ur0679729,2007-03-04,47 out of 48,False,I've always thought there's a great beauty and...,0.979167
...,...,...,...,...,...,...,...,...,...,...,...
349275,rw9093803,tt9362722,5.0,Was expecting better,vallscar,ur148520705,2023-06-01,45 out of 107,False,"Story ended before I knew! It ended abruptly, ...",0.420561
349276,rw9101137,tt9362722,10.0,A+++++ A Masterpiece for Cinema,russellquisenberry,ur76726852,2023-06-04,4 out of 6,False,15/10!!!! Spider-Man: Across the Spider-Verse ...,0.666667
349277,rw9104768,tt9362722,8.0,"Solid, but not very satisfying on its own",roaches_97,ur141489368,2023-06-06,9 out of 20,False,Across the Spider Verse is a very solid movie....,0.450000
349278,rw9107052,tt9362722,9.0,Very nearly a masterpiece,cardsrock,ur24373984,2023-06-07,4 out of 8,False,It's truly incredible how good of a follow-up ...,0.500000


In [6]:
## split train test
df_train = df[~df['ratings'].isna()]
df_test = df[df['ratings'].isna()]

In [7]:
## get label
def get_label(ratings:float):
    if ratings <= 4:
        return 'NEG'
    elif ratings >= 8:
        return 'POS'
    return 'NEU'
df_train['label'] = df_train['ratings'].apply(get_label)
## create column is_new_user
df_train['is_new_user'] = ~(df_train['user_id_review'].duplicated())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['label'] = df_train['ratings'].apply(get_label)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['is_new_user'] = ~(df_train['user_id_review'].duplicated())


In [8]:
df_train.to_csv("data/train.csv")
df_test.to_csv("data/test.csv")

In [10]:
import matplotlib.pyplot as plt

In [13]:
df.head()

Unnamed: 0,id,movie_id,ratings,title,user_review,user_id_review,date_review,total_agree,is_spoilers,description,agree
0,rw0002492,tt0012349,,A Must-See Silent Comedy,Snow Leopard,ur1174211,2002-03-04,87 out of 91,False,While perhaps not as celebrated now as some of...,0.956044
1,rw6111597,tt0012349,9.0,Emphatically Outstanding...,Xstal,ur4103165,2020-09-20,29 out of 30,False,It takes your breath away over 100 years later...,0.966667
2,rw1223756,tt0012349,9.0,One Of The Most Memorable Silent Films Ever,ccthemovieman-1,ur4445210,2005-11-24,70 out of 79,False,"Wow, is this a memorable film! It is one of th...",0.886076
3,rw1160671,tt0012349,9.0,Smiling and Tearing,Cineanalyst,ur1888886,2005-08-29,33 out of 35,True,"""The Kid"" is a powerfully emotional and wonder...",0.942857
4,rw1611019,tt0012349,,Chaplin understands how close slapstick is to ...,J. Spurlin,ur0679729,2007-03-04,47 out of 48,False,I've always thought there's a great beauty and...,0.979167


In [28]:
df['year'] = df['date_review'].dt.year
def chart_1(df):
    number_review_per_year = df.groupby('year')['id'].count()
    time_line = list(number_review_per_year.index.values)
    values = list(number_review_per_year.values)
    return [time_line, values]

def chart_2(df):
    df = df[df['user_id_review'].duplicated()]
    count_new_user = df.groupby('year')['user_id_review'].count()
    value = list(count_new_user.values)
    return value

def chart_3(df):
    df = df[~df['ratings'].isna()]
    avg_ratings_per_year = df.groupby('year')['ratings'].mean()
    return list(avg_ratings_per_year.values)

def get_number_user_film(df):
    number_user = df['user_id_review'].unique().shape[0]
    number_film = df['movie_id'].unique().shape[0]
    return [number_film, number_user]


In [33]:
df['user_id_review'].unique().shape[0]

202293