# Cleaning roberta-labeled comments from 3-hour labeling run

In [1]:
# Imports
import pandas as pd

In [2]:
# Read data into pandas dataframe
df = pd.read_json("280_min_stretch_spam_dropped_roberta.json")

In [3]:
# Check the number of rows in the dataframe
len(df)

189762

In [4]:
# Check the unique values in the "score" column
df["score"].unique()

array([ 1.,  2.,  0., nan])

In [5]:
# Only keep the rows where "score" column is valid
df = df[df["score"].notnull()]

In [6]:
# Check the unique values in the "score" column
df["score"].unique()

array([1., 2., 0.])

In [7]:
# Check the number of rows in the dataframe
len(df)

189489

In [8]:
# Drop the "results" column
df = df.drop(columns=["results"])

In [9]:
# Check the data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189489 entries, 0 to 189761
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   comment         189489 non-null  object 
 1   category        189489 non-null  object 
 2   channel         189489 non-null  object 
 3   video_url       189489 non-null  object 
 4   video_id        189489 non-null  object 
 5   channel_size    189489 non-null  int64  
 6   identity        189489 non-null  int64  
 7   expanded_text   189489 non-null  object 
 8   processed_text  189489 non-null  object 
 9   score           189489 non-null  float64
dtypes: float64(1), int64(2), object(7)
memory usage: 15.9+ MB


In [10]:
# Cast the "score" column as integers
df = df.astype({"score":"int64"})

# Check the data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189489 entries, 0 to 189761
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   comment         189489 non-null  object
 1   category        189489 non-null  object
 2   channel         189489 non-null  object
 3   video_url       189489 non-null  object
 4   video_id        189489 non-null  object
 5   channel_size    189489 non-null  int64 
 6   identity        189489 non-null  int64 
 7   expanded_text   189489 non-null  object
 8   processed_text  189489 non-null  object
 9   score           189489 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 15.9+ MB


In [11]:
# Check the unique values in the "channel_size" column
df["channel_size"].unique()

array([0, 1])

In [12]:
# Reset the dataframe's index
df.reset_index(drop=True,inplace=True)

In [13]:
# Replace values in the "channel_size" column
for x in range(0,len(df)):
    if df["channel_size"][x] == 1:
        df["channel_size"][x] = "large"
    else:
        df["channel_size"][x] = "small"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["channel_size"][x] = "small"


In [14]:
# Check the unique values in the "channel_size" column
df["channel_size"].unique()

array(['small', 'large'], dtype=object)

In [15]:
# Check the unique values in the "identity" column
df["identity"].unique()

array([1, 0])

In [16]:
# Replace values in the "identity" column
for x in range(0,len(df)):
    if df["identity"][x] == 1:
        df["identity"][x] = "unknown"
    else:
        df["identity"][x] = "lgbt"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["identity"][x] = "unknown"


In [17]:
# Check the unique values in the "identity" column
df["identity"].unique()

array(['unknown', 'lgbt'], dtype=object)

In [18]:
# Check the unique values in the "category" column
df["category"].unique()

array(['Automobiles', 'Comedy', 'Entertainment', 'Film',
       'News and Politics', 'Gaming', 'Science and Technology', 'Shows',
       'Sports', 'Pets and Animals', 'Music', 'Travel', 'Problematic',
       'Education', 'Howto', 'Nonprofit', 'People'], dtype=object)

In [19]:
# Save cleaned dataframe to json
df.to_json("final_data_for_machine_learning.json")

In [20]:
# Read clean json into a pandas dataframe
df1 = pd.read_json("final_data_for_machine_learning.json")

In [21]:
# Check the dataframe info
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 189489 entries, 0 to 189488
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   comment         189489 non-null  object
 1   category        189489 non-null  object
 2   channel         189489 non-null  object
 3   video_url       189489 non-null  object
 4   video_id        189489 non-null  object
 5   channel_size    189489 non-null  object
 6   identity        189489 non-null  object
 7   expanded_text   189489 non-null  object
 8   processed_text  189489 non-null  object
 9   score           189489 non-null  int64 
dtypes: int64(1), object(9)
memory usage: 15.9+ MB
