In [1]:
import numpy as np
import pandas as pd

## 1 Check Data Before Cleaning
### Use the Tesla data as the sample because it has the largest amount of data, so missing values are expected to be more apparent there

In [4]:
#### Load the raw Tesla post and comment CSV files for a pre-cleaning check
# NOTE(review): both paths are absolute and start at the filesystem root
# ("/Data_Raw/..."); confirm that this is where the raw data lives.
df_post_check = pd.read_csv("/Data_Raw/Tesla_post_info.csv")
df_comment_check = pd.read_csv("/Data_Raw/Tesla_comment_info.csv")

In [21]:
#### Inspect post data in terms of the size and missing values
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df_post_check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1343 entries, 0 to 1342
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      1343 non-null   int64  
 1   Post_Title      1343 non-null   object 
 2   Author          1343 non-null   int64  
 3   Date            1343 non-null   object 
 4   Post_Content    1342 non-null   object 
 5   Comment_Number  1275 non-null   float64
 6   Net_Likes       1343 non-null   int64  
 7   Views           1343 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 84.1+ KB
None


In [19]:
#### Inspect comment data in terms of the size and missing values
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df_comment_check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707 entries, 0 to 706
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       707 non-null    int64 
 1   Comment_id       707 non-null    int64 
 2   Post_id          707 non-null    object
 3   Author           707 non-null    int64 
 4   Date             707 non-null    object
 5   Comment_Content  707 non-null    object
 6   Net_Likes        707 non-null    int64 
 7   Reply_id         707 non-null    object
dtypes: int64(4), object(4)
memory usage: 44.3+ KB
None


## 2 Iterate Data by Brands to Clean
### Clean the post data first, then the comment data

In [22]:
#### Brand names; each one is the file-name prefix of that brand's raw CSV files
l_brand = ["Fiat", "Mini", "Tesla", "Pg", "VW"]
#### Clean the post and comment data of every brand and save the results
for i in l_brand:
    # build the raw-data paths for the current brand and load both tables
    path_post = f"/Data_Raw/{i}_post_info.csv"
    df_post_ori = pd.read_csv(path_post)
    path_comment = f"/Data_Raw/{i}_comment_info.csv"
    df_comment_ori = pd.read_csv(path_comment)

    # posts: drop the stored index column, then fill missing content with the
    # "None" placeholder string and missing comment counts with 0
    df_post_ori = (
        df_post_ori
        .drop(columns="Unnamed: 0")
        .assign(
            Post_Content=lambda d: d["Post_Content"].fillna("None"),
            Comment_Number=lambda d: d["Comment_Number"].fillna(0),
        )
    )

    # Phrase that separates the repeated (replied-to) text from the comment's
    # own text; only the part after its last occurrence is kept
    ignore_phrase = "Click to expand"

    # comments: drop the stored index column, fill missing reply ids with 0,
    # discard rows whose content is the "None" placeholder or that still hold
    # a missing value, renumber the rows, and finally strip everything up to
    # and including the ignore phrase from each comment
    df_comment_ori = (
        df_comment_ori
        .drop(columns="Unnamed: 0")
        .assign(Reply_id=lambda d: d["Reply_id"].fillna(0))
        .loc[lambda d: d["Comment_Content"] != "None"]
        .dropna()
        .reset_index(drop=True)
        .assign(
            Comment_Content=lambda d: d["Comment_Content"].apply(
                lambda text: text.rpartition(ignore_phrase)[2]
            )
        )
    )

    # output file names (written relative to the current working directory)
    name_post = f"{i}_post_info_c1.csv"
    name_comment = f"{i}_comment_info_c1.csv"
    # save with the row index included, so the files get an unnamed first
    # column again (comma separator and header row are the to_csv defaults)
    df_post_ori.to_csv(name_post, index=True)
    df_comment_ori.to_csv(name_comment, index=True)

### Check cleaned data

In [17]:
#### Load the cleaned Fiat post and comment files for a post-cleaning check
# These are relative paths, so the files are read from the current working
# directory, which is also where the cleaning loop writes its "*_c1.csv" files.
df_post_clean_1 = pd.read_csv("Fiat_post_info_c1.csv")
df_comment_clean_1 = pd.read_csv("Fiat_comment_info_c1.csv")

In [18]:
#### Inspect post data in terms of the size and missing values
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df_post_clean_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      51 non-null     int64  
 1   Post_Title      51 non-null     object 
 2   Author          51 non-null     int64  
 3   Date            51 non-null     object 
 4   Post_Content    51 non-null     object 
 5   Comment_Number  51 non-null     float64
 6   Net_Likes       51 non-null     int64  
 7   Views           51 non-null     int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 3.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      51 non-null     int64  
 1   Post_Title      51 non-null     object 
 2   Author          51 non-null     int64  
 3   Date            51 non-null     object 
 4   Post

In [19]:
#### Inspect comment data in terms of the size and missing values
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df_comment_clean_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       730 non-null    int64  
 1   Comment_id       730 non-null    int64  
 2   Post_id          730 non-null    int64  
 3   Author           730 non-null    int64  
 4   Date             730 non-null    object 
 5   Comment_Content  730 non-null    object 
 6   Net_Likes        730 non-null    int64  
 7   Reply_id         305 non-null    float64
dtypes: float64(1), int64(5), object(2)
memory usage: 45.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       730 non-null    int64  
 1   Comment_id       730 non-null    int64  
 2   Post_id          730 non-null    int64  
 3   Author           730 non-null