## Import All Necessary Packages/Libraries

In [1]:
import pandas as pd

## Data Wrangling

### Gathering Data

In [2]:
# Load the raw data file; it is tab-separated despite its .csv extension.
raw_path = 'data/riasec/data.csv'
df = pd.read_csv(raw_path, sep='\t')
df.head()

  df = pd.read_csv('data/riasec/data.csv', delimiter='\t')


Unnamed: 0,R1,R2,R3,R4,R5,R6,R7,R8,I1,I2,...,orientation,race,voted,married,familysize,uniqueNetworkLocation,country,source,major,Unnamed: 93
0,3,4,3,1,1,4,1,3,5,5,...,1,1,2,1,1,1,US,2,,
1,1,1,2,4,1,2,2,1,5,5,...,3,4,1,2,3,1,US,1,Nursing,
2,2,1,1,1,1,1,1,1,4,1,...,1,4,2,1,1,1,US,1,,
3,3,1,1,2,2,2,2,2,4,1,...,1,1,2,1,1,1,CN,0,,
4,4,1,1,2,1,1,1,2,5,5,...,3,1,2,1,4,1,PH,0,education,


### Assessing Data 

In [3]:
# Print the DataFrame's dimensions as (rows, columns).
print(df.shape)

(145828, 94)


In [4]:
# Summarize each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145828 entries, 0 to 145827
Data columns (total 94 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   R1                     145828 non-null  int64 
 1   R2                     145828 non-null  int64 
 2   R3                     145828 non-null  int64 
 3   R4                     145828 non-null  int64 
 4   R5                     145828 non-null  int64 
 5   R6                     145828 non-null  int64 
 6   R7                     145828 non-null  int64 
 7   R8                     145828 non-null  int64 
 8   I1                     145828 non-null  int64 
 9   I2                     145828 non-null  int64 
 10  I3                     145828 non-null  int64 
 11  I4                     145828 non-null  int64 
 12  I5                     145828 non-null  int64 
 13  I6                     145828 non-null  int64 
 14  I7                     145828 non-null  int64 
 15  

In [5]:
# Count the missing (NaN) values in every column.
df.isna().sum()

R1                            0
R2                            0
R3                            0
R4                            0
R5                            0
                          ...  
uniqueNetworkLocation         0
country                      12
source                        0
major                     52874
Unnamed: 93              145827
Length: 94, dtype: int64

The output shows missing values in three columns: 'country' (12), 'major' (52,874), and 'Unnamed: 93' (145,827).

### Cleaning Data

First, we will handle the 'Unnamed: 93' column by dropping it entirely. It is empty in 145,827 of the 145,828 rows, so it carries no meaningful information.

In [None]:
# Drop the almost entirely empty trailing column. Column labels are
# case-sensitive: the column is called 'Unnamed: 93' (capital U), and the
# lowercase spelling raises a KeyError.
df = df.drop(columns=['Unnamed: 93'])
df.head()

Unnamed: 0,R1,R2,R3,R4,R5,R6,R7,R8,I1,I2,...,religion,orientation,race,voted,married,familysize,uniqueNetworkLocation,country,source,major
0,3,4,3,1,1,4,1,3,5,5,...,7,1,1,2,1,1,1,US,2,
1,1,1,2,4,1,2,2,1,5,5,...,7,3,4,1,2,3,1,US,1,Nursing
2,2,1,1,1,1,1,1,1,4,1,...,7,1,4,2,1,1,1,US,1,
3,3,1,1,2,2,2,2,2,4,1,...,0,1,1,2,1,1,1,CN,0,
4,4,1,1,2,1,1,1,2,5,5,...,4,3,1,2,1,4,1,PH,0,education


Next, we'll look at the distribution of values in the 'country' column and fill its missing values with the placeholder category 'Unknown'.

In [12]:
# Tally the rows per country, most frequent first.
df['country'].value_counts()

country
US    80579
MY     7841
CA     7256
SG     5769
GB     5533
      ...  
SS        1
LA        1
FK        1
NR        1
UZ        1
Name: count, Length: 188, dtype: int64

In [None]:
# Replace missing country values with the placeholder 'Unknown', then
# re-count the missing values per column to confirm the result.
df['country'] = df['country'].fillna('Unknown')
df.isna().sum()

R1                           0
R2                           0
R3                           0
R4                           0
R5                           0
                         ...  
familysize                   0
uniqueNetworkLocation        0
country                      0
source                       0
major                    52874
Length: 93, dtype: int64

Finally, we will fill the missing values in the 'major' column with the category 'Unknown'. This keeps the 52,874 affected rows in the dataset without assigning them an arbitrary major, which could bias the clustering process.

In [18]:
# Replace missing majors with the placeholder 'Unknown', then re-inspect
# the per-column non-null counts and dtypes.
df['major'] = df['major'].fillna('Unknown')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145828 entries, 0 to 145827
Data columns (total 93 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   R1                     145828 non-null  int64 
 1   R2                     145828 non-null  int64 
 2   R3                     145828 non-null  int64 
 3   R4                     145828 non-null  int64 
 4   R5                     145828 non-null  int64 
 5   R6                     145828 non-null  int64 
 6   R7                     145828 non-null  int64 
 7   R8                     145828 non-null  int64 
 8   I1                     145828 non-null  int64 
 9   I2                     145828 non-null  int64 
 10  I3                     145828 non-null  int64 
 11  I4                     145828 non-null  int64 
 12  I5                     145828 non-null  int64 
 13  I6                     145828 non-null  int64 
 14  I7                     145828 non-null  int64 
 15  

The missing values identified above have now been handled, so the cleaned dataset can be saved.

In [21]:
# Write the cleaned data to disk. index=False leaves the row index out of
# the file. No separator is given, so the output uses to_csv's default
# comma, whereas the raw input file was tab-separated.
df.to_csv("data/riasec/cleaned_data.csv", index=False)