In [1]:

import pandas as pd
import numpy as np
from faker import Faker

# Initialize Faker to generate fake data
fake = Faker()

# Define the number of records
n_records = 1000

# Generate the dataset
data = {
    "Dam_ID": [fake.unique.uuid4() for _ in range(n_records)],
    "Dam_Health": np.random.choice(['Good', 'Average', 'Poor'], size=n_records, p=[0.6, 0.3, 0.1]),
    "Water_Level_m": np.random.normal(loc=100, scale=30, size=n_records),  # Normally distributed values
    "Water_Inflow_cms": np.random.normal(loc=500, scale=150, size=n_records),  # Normally distributed values
    "Water_Outflow_cms": np.random.normal(loc=480, scale=145, size=n_records),  # Normally distributed values
    "Reservoir_Capacity_percent": np.random.uniform(30, 100, size=n_records),
    "Sedimentation_Rate_m_per_year": np.random.normal(loc=0.5, scale=0.2, size=n_records),  # Normally distributed
    "Water_Quality_Index": np.random.uniform(50, 100, size=n_records),
    "Temperature_C": np.random.uniform(15, 35, size=n_records),
    "Humidity_percent": np.random.uniform(40, 90, size=n_records),
    "Wind_Speed_kmh": np.random.uniform(5, 50, size=n_records),
    "Soil_Moisture_percent": np.random.uniform(10, 50, size=n_records),
    "Evaporation_Rate_mm_per_day": np.random.uniform(2, 10, size=n_records),
    "Ponding_mm": np.random.uniform(0, 5, size=n_records),
    "Water_Temperature_C": np.random.uniform(15, 30, size=n_records),
}

# Convert to a DataFrame
df = pd.DataFrame(data)

# Add outliers to specific features
# Outliers in Water Level
outlier_indices = np.random.choice(df.index, size=10, replace=False)  # 10 outliers
df.loc[outlier_indices, "Water_Level_m"] = df["Water_Level_m"].max() + np.random.uniform(50, 100, 10)

# Outliers in Water Inflow
outlier_indices = np.random.choice(df.index, size=10, replace=False)
df.loc[outlier_indices, "Water_Inflow_cms"] = df["Water_Inflow_cms"].max() + np.random.uniform(200, 400, 10)

# Outliers in Water Outflow
outlier_indices = np.random.choice(df.index, size=10, replace=False)
df.loc[outlier_indices, "Water_Outflow_cms"] = df["Water_Outflow_cms"].min() - np.random.uniform(200, 400, 10)

# Outliers in Sedimentation Rate
outlier_indices = np.random.choice(df.index, size=5, replace=False)
df.loc[outlier_indices, "Sedimentation_Rate_m_per_year"] = df["Sedimentation_Rate_m_per_year"].max() + np.random.uniform(0.5, 1, 5)

# Outliers in Water Quality Index
outlier_indices = np.random.choice(df.index, size=5, replace=False)
df.loc[outlier_indices, "Water_Quality_Index"] = df["Water_Quality_Index"].min() - np.random.uniform(20, 40, 5)

# Display the first few rows of the updated dataset
print(df.head())


                                 Dam_ID Dam_Health  Water_Level_m  \
0  0bd4504a-97f8-4913-8987-15e07344c80b       Good     139.131985   
1  94043a5f-cd5b-4dc1-b1aa-2a5529699803    Average      82.445133   
2  385515c8-69f6-4c4b-b5c9-5ddbdd8f6cad    Average      70.644867   
3  710e060d-5329-4864-9073-b37959bd9b4e       Poor      62.763778   
4  ad2c0aa6-555a-4bd0-b673-c4a60fa3245d       Good      97.803745   

   Water_Inflow_cms  Water_Outflow_cms  Reservoir_Capacity_percent  \
0        427.431486         250.186361                   66.164824   
1        556.558974         476.120582                   49.923100   
2        707.424617         443.059176                   93.941067   
3        629.311983         512.741565                   81.352248   
4        577.761395         538.653399                   96.402240   

   Sedimentation_Rate_m_per_year  Water_Quality_Index  Temperature_C  \
0                       0.421067            82.076535      15.564566   
1                   

In [2]:
# Save the dataset to a CSV file
df.to_csv("dam_water_prediction_dataset_with_outliers.csv", index=False)

# Confirm that the dataset has been saved
print("Dataset saved successfully!")


Dataset saved successfully!


In [3]:
df = pd.read_csv("/content/employee_performance_dataset.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/content/employee_performance_dataset.csv'

In [None]:
df

Unnamed: 0,Employee_ID,Full_Name,Age,Gender,Department,Job_Role,Years_At_Company,Education_Level,Annual_Salary,Performance_Score,Work_Life_Balance_Score,Training_Hours,Satisfaction_Score,Last_Promotion_Years_Ago,Job_Level,Overtime,Leaves_Taken,Supervisor_Rating,Projects_Handled,Attrition_Flag
0,962d9569-02d3-42cb-9d01-844698a4590f,Melissa Cox,54.0,Female,Finance,Clerk,4.0,Master's,121265.0,3.454319,2.187229,3.0,3.625990,14.0,4.0,No,29.0,4.614073,0.0,1.0
1,27901d72-b6a3-4549-ad7c-5aed0310f837,Terry Stone,31.0,Male,HR,Clerk,19.0,Master's,46738.0,1.519777,4.987681,12.0,4.943648,12.0,1.0,No,30.0,1.767530,8.0,1.0
2,0a6a8991-282d-4d5b-a58d-4d0d7adf5a99,Nicole Santos,52.0,Male,Finance,Engineer,24.0,Bachelor's,35391.0,4.144875,3.932186,7.0,3.856474,1.0,4.0,Yes,15.0,2.987046,1.0,0.0
3,41afc1bf-95b0-4a6d-b360-e0f851686989,Erin Smith,39.0,Male,Marketing,Analyst,13.0,Bachelor's,95708.0,3.821232,4.634435,8.0,2.910590,4.0,5.0,Yes,5.0,1.572446,9.0,0.0
4,168113d1-fc56-4e98-9a83-a47fbef5c25d,Linda Newman,38.0,Male,Operations,Executive,17.0,Bachelor's,138155.0,4.795727,2.470572,73.0,2.776695,5.0,1.0,Yes,2.0,3.975244,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1573,0479e59d-62cd-4fbf-8d60-38392d50be71,Chelsey Marquez,37.0,Female,Finance,Developer,20.0,Master's,107838.0,2.078083,3.024376,72.0,1.261000,4.0,3.0,Yes,13.0,1.884856,10.0,0.0
1574,4de13a9c-96bb-49c3-8063-96141316da83,Dakota Jones,24.0,Female,IT,Manager,11.0,Bachelor's,75351.0,3.682280,3.638638,86.0,2.458182,15.0,4.0,Yes,1.0,1.859549,2.0,1.0
1575,,,,,,,,,,,,,-1.000000,,,,,,,
1576,,,10.0,,,,,,,,,,,,,,,,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Dam_ID                         1000 non-null   object 
 1   Dam_Health                     1000 non-null   object 
 2   Water_Level_m                  1000 non-null   float64
 3   Water_Inflow_cms               1000 non-null   float64
 4   Water_Outflow_cms              1000 non-null   float64
 5   Reservoir_Capacity_percent     1000 non-null   float64
 6   Sedimentation_Rate_m_per_year  1000 non-null   float64
 7   Water_Quality_Index            1000 non-null   float64
 8   Temperature_C                  1000 non-null   float64
 9   Humidity_percent               1000 non-null   float64
 10  Wind_Speed_kmh                 1000 non-null   float64
 11  Soil_Moisture_percent          1000 non-null   float64
 12  Evaporation_Rate_mm_per_day    1000 non-null   fl

In [5]:
df.duplicated().sum()

np.int64(0)

##Data Cleaning


In [6]:
df.isnull().sum()

Dam_ID                           0
Dam_Health                       0
Water_Level_m                    0
Water_Inflow_cms                 0
Water_Outflow_cms                0
Reservoir_Capacity_percent       0
Sedimentation_Rate_m_per_year    0
Water_Quality_Index              0
Temperature_C                    0
Humidity_percent                 0
Wind_Speed_kmh                   0
Soil_Moisture_percent            0
Evaporation_Rate_mm_per_day      0
Ponding_mm                       0
Water_Temperature_C              0
dtype: int64

In [7]:
df.drop(["Employee_ID","Full_Name"],axis=1,inplace=True)

KeyError: "['Employee_ID', 'Full_Name'] not found in axis"

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1578 entries, 0 to 1577
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       1571 non-null   float64
 1   Gender                    1571 non-null   object 
 2   Department                1568 non-null   object 
 3   Job_Role                  1570 non-null   object 
 4   Years_At_Company          1566 non-null   float64
 5   Education_Level           1563 non-null   object 
 6   Annual_Salary             1567 non-null   float64
 7   Performance_Score         1568 non-null   float64
 8   Work_Life_Balance_Score   1562 non-null   float64
 9   Training_Hours            1568 non-null   float64
 10  Satisfaction_Score        1569 non-null   float64
 11  Last_Promotion_Years_Ago  1569 non-null   float64
 12  Job_Level                 1566 non-null   float64
 13  Overtime                  1569 non-null   object 
 14  Leaves_T

In [None]:
df.dropna(inplace=True)

In [8]:
cat_cols = []
for col in df.columns:
  if df[col].dtype == 'object':
    cat_cols.append(col)

In [9]:
cat_cols

['Dam_ID', 'Dam_Health']

In [10]:
for i in cat_cols:
   print(f"Unqiue value ${i} = {df[i].unique()}")

Unqiue value $Dam_ID = ['0bd4504a-97f8-4913-8987-15e07344c80b'
 '94043a5f-cd5b-4dc1-b1aa-2a5529699803'
 '385515c8-69f6-4c4b-b5c9-5ddbdd8f6cad'
 '710e060d-5329-4864-9073-b37959bd9b4e'
 'ad2c0aa6-555a-4bd0-b673-c4a60fa3245d'
 '74ac3f95-d6df-4174-a1c0-16343900f42b'
 'eaf88d90-7080-4d25-8726-9e1158298a74'
 '2887990b-6980-4af9-9320-614ed563888d'
 'b5afe52f-ad74-4201-9d34-f7922bfbe8cd'
 '4b1703aa-780e-41a3-8333-88280bc3ca47'
 'f5696e58-f9b8-4641-9b5f-8fa8ababe378'
 'ff3a9bd4-dc09-4cf0-8b8e-ba0807a80760'
 '8901bfe9-2841-4dc2-9d4f-900fc61cc579'
 'bad3591c-c3ca-4ac3-b824-cbde8f10de6a'
 '37ec7a4d-cb44-4035-9649-ed3e8f22729b'
 '1d8faf9c-1fcc-47c5-8811-cbca0f48adc0'
 'c4c2557b-fe26-4f81-99ef-9321a54fc9f6'
 'eb233989-e3cd-4891-8873-e070d20e7360'
 'cb358ef6-d7f8-46c7-b5f1-dfec39963f53'
 '10eb285f-ae4b-4134-9284-fab20ebb8c43'
 'c8592995-06e6-4dea-8b46-a2cb653c49aa'
 '0249819a-7d86-4714-88bc-74c32e0f26e5'
 'd699fc9e-37dd-4924-b96b-d316ae982608'
 'aeb1e458-0346-4da3-b180-49b6f88f003d'
 '4d8566e3-53d9-4

In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in cat_cols:
    df[col] = le.fit_transform(df[col])

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Dam_ID                         1000 non-null   int64  
 1   Dam_Health                     1000 non-null   int64  
 2   Water_Level_m                  1000 non-null   float64
 3   Water_Inflow_cms               1000 non-null   float64
 4   Water_Outflow_cms              1000 non-null   float64
 5   Reservoir_Capacity_percent     1000 non-null   float64
 6   Sedimentation_Rate_m_per_year  1000 non-null   float64
 7   Water_Quality_Index            1000 non-null   float64
 8   Temperature_C                  1000 non-null   float64
 9   Humidity_percent               1000 non-null   float64
 10  Wind_Speed_kmh                 1000 non-null   float64
 11  Soil_Moisture_percent          1000 non-null   float64
 12  Evaporation_Rate_mm_per_day    1000 non-null   fl

In [13]:
from sklearn.ensemble import RandomForestClassifier
