In [4]:
import pandas as pd
# Load data
data = pd.read_csv("survey.csv")
print(data.head())

             Timestamp  Age  Gender         Country state self_employed  \
0  2014-08-27 11:29:31   37  Female   United States    IL           NaN   
1  2014-08-27 11:29:37   44       M   United States    IN           NaN   
2  2014-08-27 11:29:44   32    Male          Canada   NaN           NaN   
3  2014-08-27 11:29:46   31    Male  United Kingdom   NaN           NaN   
4  2014-08-27 11:30:22   31    Male   United States    TX           NaN   

  family_history treatment work_interfere    no_employees  ...  \
0             No       Yes          Often            6-25  ...   
1             No        No         Rarely  More than 1000  ...   
2             No        No         Rarely            6-25  ...   
3            Yes       Yes          Often          26-100  ...   
4             No        No          Never         100-500  ...   

                leave mental_health_consequence phys_health_consequence  \
0       Somewhat easy                        No                      No   
1 

In [7]:
# 1. Handling Missing Values
# Check for missing values
missing_values = data.isnull().sum()
# Handle missing values (e.g., impute with the mean or mode)
data["Age"].fillna(data["Age"].mean(), inplace=True)
data["Gender"].fillna("Other", inplace=True)

In [8]:
# 2. Data Type Conversion
data["Timestamp"] = pd.to_datetime(data["Timestamp"])
data["Gender"] = data["Gender"].astype("category")

In [9]:
# 3. Remove Duplicates
data = data.drop_duplicates()

In [10]:
# 4. Dealing with Outliers
# Identify and deal with outliers in a specific numeric column (e.g., Age)
Q1 = data["Age"].quantile(0.25)
Q3 = data["Age"].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data = data[(data["Age"] >= lower_bound) & (data["Age"] <= upper_bound)]

In [13]:
# 5. Rename Columns
data = data.rename(columns={"self_employed": "SelfEmployed"})

In [14]:
# 6. Data Validation and Sanity Checks
# Example: Remove rows with invalid ages (e.g., negative ages)
data = data[data["Age"] >= 0]

In [15]:
# 7. Data Encoding (One-Hot Encoding)
data = pd.get_dummies(data, columns=["Gender"], prefix="Gender")

In [16]:
# 8. Data after cleaning
print(data.head())

            Timestamp  Age         Country state SelfEmployed family_history  \
0 2014-08-27 11:29:31   37   United States    IL          NaN             No   
1 2014-08-27 11:29:37   44   United States    IN          NaN             No   
2 2014-08-27 11:29:44   32          Canada   NaN          NaN             No   
3 2014-08-27 11:29:46   31  United Kingdom   NaN          NaN            Yes   
4 2014-08-27 11:30:22   31   United States    TX          NaN             No   

  treatment work_interfere    no_employees remote_work  ... Gender_male  \
0       Yes          Often            6-25          No  ...           0   
1        No         Rarely  More than 1000          No  ...           0   
2        No         Rarely            6-25          No  ...           0   
3       Yes          Often          26-100          No  ...           0   
4        No          Never         100-500         Yes  ...           0   

  Gender_male leaning androgynous Gender_msle Gender_non-binary  \
0

In [17]:
# 9. Save the Cleaned Data
data.to_csv("cleaned_mental_health_survey.csv", index=False)

In [18]:
# Load data from the cleaned file
data = pd.read_csv("cleaned_mental_health_survey.csv")
print(data.head())

             Timestamp  Age         Country state SelfEmployed family_history  \
0  2014-08-27 11:29:31   37   United States    IL          NaN             No   
1  2014-08-27 11:29:37   44   United States    IN          NaN             No   
2  2014-08-27 11:29:44   32          Canada   NaN          NaN             No   
3  2014-08-27 11:29:46   31  United Kingdom   NaN          NaN            Yes   
4  2014-08-27 11:30:22   31   United States    TX          NaN             No   

  treatment work_interfere    no_employees remote_work  ... Gender_male  \
0       Yes          Often            6-25          No  ...           0   
1        No         Rarely  More than 1000          No  ...           0   
2        No         Rarely            6-25          No  ...           0   
3       Yes          Often          26-100          No  ...           0   
4        No          Never         100-500         Yes  ...           0   

  Gender_male leaning androgynous Gender_msle Gender_non-binar