In [2]:
import pandas as pd
import numpy as np

# **Phase 1: Data Cleaning & Preprocessing**

**Step 1: Load Dataset & Verify Structure**

In [3]:
# Load dataset
df = pd.read_csv('dataset 1.csv')   

# Display basic info
print("✅ Dataset Loaded Successfully!")
print("📌 Shape of Data:", df.shape)
print("\n📝 First 5 Rows:")
print(df.head())

# Check for missing values
print("\n⚠️ Missing Values Per Column:")
print(df.isnull().sum())

# Check data types
print("\n🛠 Column Data Types:")
print(df.dtypes)


  df = pd.read_csv('dataset 1.csv')


✅ Dataset Loaded Successfully!
📌 Shape of Data: (4526654, 24)

📝 First 5 Rows:
   Index        pH      Iron   Nitrate    Chloride Lead      Zinc  \
0      0  8.332988  0.000083  8.605777  122.799772  0.0  3.434827   
1      1  6.917863  0.000081  3.734167  227.029851  0.0  1.245317   
2      2  5.443762  0.020106  3.816994  230.995630  0.0  0.528280   
3      3  7.955339  0.143988  8.224944  178.129940  0.0  4.027879   
4      4  8.091909  0.002167  9.925788  186.540872  0.0  3.807511   

            Color  Turbidity  Fluoride  ...  Chlorine     Manganese  \
0       Colorless   0.022683  0.607283  ...  3.708178  2.269945e-15   
1    Faint Yellow   0.019007  0.622874  ...  3.292038  8.024076e-07   
2    Light Yellow   0.319956  0.423423  ...  3.560224  7.007989e-02   
3  Near Colorless   0.166319  0.208454  ...  3.516907  2.468295e-02   
4    Light Yellow   0.004867  0.222912  ...  3.177849  3.296139e-03   

   Total Dissolved Solids  Source  Water Temperature  Air Temperature  \
0     

In [4]:
df=df.drop("Index",axis=1)

**Step 2.1: Fixing Data Types**

In [5]:
# Convert 'Lead' column to numeric (force errors='coerce' to convert non-numeric to NaN)
df["Lead"] = pd.to_numeric(df["Lead"], errors="coerce")

# Convert 'Month' to categorical 
df["Month"] = df["Month"].astype("category")

# Convert categorical text columns to category type
df["Source"] = df["Source"].astype("category")
df["Color"] = df["Color"].astype("category")

# Verify data types after conversion
print("✅ Updated Data Types:\n", df.dtypes)


✅ Updated Data Types:
 pH                         float64
Iron                       float64
Nitrate                    float64
Chloride                   float64
Lead                       float64
Zinc                       float64
Color                     category
Turbidity                  float64
Fluoride                   float64
Copper                     float64
Odor                       float64
Sulfate                    float64
Conductivity               float64
Chlorine                   float64
Manganese                  float64
Total Dissolved Solids     float64
Source                    category
Water Temperature          float64
Air Temperature            float64
Month                     category
Day                        float64
Time of Day                float64
Target                     float64
dtype: object


**Step 2.2: Handling Missing Values**

In [6]:
# Check missing values
missing_values = df.isnull().sum()

print("⚠️ Missing Values Per Column:\n", missing_values)
print("\n📌 Total Missing Values:", missing_values.sum())

# Display basic stats before handling missing values
print("\n📝 Dataset Shape Before Handling Missing Values:", df.shape)

⚠️ Missing Values Per Column:
 pH                         87993
Iron                       30268
Nitrate                    80327
Chloride                  133284
Lead                       20380
Zinc                      118656
Color                       4347
Turbidity                  37925
Fluoride                  143819
Copper                    151897
Odor                      135592
Sulfate                   149935
Conductivity              124734
Chlorine                   43995
Manganese                  83288
Total Dissolved Solids      1285
Source                     67087
Water Temperature         127378
Air Temperature            22753
Month                      72541
Day                        75838
Time of Day                86730
Target                         1
dtype: int64

📌 Total Missing Values: 1800053

📝 Dataset Shape Before Handling Missing Values: (4526654, 23)


In [7]:
# Drop rows where Target is missing (since it's only 1 row)
df = df.dropna(subset=['Target'])

# Fill missing values for numerical columns with median
num_cols = df.select_dtypes(include=['float64']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill missing values for categorical columns with mode (most frequent value)
cat_cols = df.select_dtypes(include=['category']).columns
df[cat_cols] = df[cat_cols].apply(lambda x: x.fillna(x.mode()[0]))

# Verify that missing values are handled
print("✅ Missing Values After Handling:")
print(df.isnull().sum())

# Display dataset shape after handling missing values
print("\n📌 Dataset Shape After Handling Missing Values:", df.shape)


✅ Missing Values After Handling:
pH                        0
Iron                      0
Nitrate                   0
Chloride                  0
Lead                      0
Zinc                      0
Color                     0
Turbidity                 0
Fluoride                  0
Copper                    0
Odor                      0
Sulfate                   0
Conductivity              0
Chlorine                  0
Manganese                 0
Total Dissolved Solids    0
Source                    0
Water Temperature         0
Air Temperature           0
Month                     0
Day                       0
Time of Day               0
Target                    0
dtype: int64

📌 Dataset Shape After Handling Missing Values: (4526653, 23)


**Step 2.3: Handling Invalid Values**

In [8]:
# Find negative values
invalid_values = df[num_cols][df[num_cols] < 0]

# Print results
print("⚠️ Invalid (Negative) Values Found:")
print(invalid_values.describe())


print("\n✅ Invalid Value Check Completed!")


⚠️ Invalid (Negative) Values Found:
        pH  Iron  Nitrate  Chloride  Lead  Zinc  Turbidity  Fluoride  Copper  \
count  0.0   0.0      0.0       0.0   0.0   0.0        0.0       0.0     0.0   
mean   NaN   NaN      NaN       NaN   NaN   NaN        NaN       NaN     NaN   
std    NaN   NaN      NaN       NaN   NaN   NaN        NaN       NaN     NaN   
min    NaN   NaN      NaN       NaN   NaN   NaN        NaN       NaN     NaN   
25%    NaN   NaN      NaN       NaN   NaN   NaN        NaN       NaN     NaN   
50%    NaN   NaN      NaN       NaN   NaN   NaN        NaN       NaN     NaN   
75%    NaN   NaN      NaN       NaN   NaN   NaN        NaN       NaN     NaN   
max    NaN   NaN      NaN       NaN   NaN   NaN        NaN       NaN     NaN   

       Odor  Sulfate  Conductivity  Chlorine  Manganese  \
count   0.0      0.0           0.0       0.0        0.0   
mean    NaN      NaN           NaN       NaN        NaN   
std     NaN      NaN           NaN       NaN        NaN   
min    

In [9]:
# Remove rows containing negative values in any numerical column
df = df[(df[num_cols] >= 0).all(axis=1)].copy()

# Verify if any negative values remain
negative_check = (df[num_cols] < 0).sum()

# Display confirmation
print("✅ Negative Values Removed Successfully!")

print("\n⚠️ Remaining Negative Values:")
print(negative_check)

# Check dataset shape after dropping invalid rows
print("\n📌 Dataset Shape After Removing Negative Values:", df.shape)


✅ Negative Values Removed Successfully!

⚠️ Remaining Negative Values:
pH                        0
Iron                      0
Nitrate                   0
Chloride                  0
Lead                      0
Zinc                      0
Turbidity                 0
Fluoride                  0
Copper                    0
Odor                      0
Sulfate                   0
Conductivity              0
Chlorine                  0
Manganese                 0
Total Dissolved Solids    0
Water Temperature         0
Air Temperature           0
Day                       0
Time of Day               0
Target                    0
dtype: int64

📌 Dataset Shape After Removing Negative Values: (4524526, 23)


**Step 2.3: Outlier Handling**

In [10]:
print(df.describe())

                 pH          Iron       Nitrate      Chloride           Lead  \
count  4.524526e+06  4.524526e+06  4.524526e+06  4.524526e+06   4.524526e+06   
mean   7.445999e+00  1.252322e-01  6.147248e+00  1.837081e+02   1.459541e-03   
std    8.741905e-01  4.740444e-01  3.211280e+00  6.703719e+01   3.205737e-02   
min    1.057113e+00  2.047587e-53  2.861727e-01  2.363919e+01   0.000000e+00   
25%    6.908848e+00  1.041383e-05  4.000400e+00  1.392692e+02  3.368981e-122   
50%    7.450047e+00  2.226640e-03  5.599301e+00  1.758433e+02   2.137893e-62   
75%    8.000353e+00  5.298038e-02  7.613478e+00  2.161728e+02   2.494635e-27   
max    1.291072e+01  1.935315e+01  9.639078e+01  1.507310e+03   5.844281e+00   

               Zinc     Turbidity      Fluoride        Copper          Odor  \
count  4.524526e+06  4.524526e+06  4.524526e+06  4.524526e+06  4.524526e+06   
mean   1.534867e+00  5.137011e-01  9.551685e-01  5.074587e-01  1.797618e+00   
std    1.522185e+00  9.136822e-01  8.07602

In [11]:
from scipy.stats.mstats import winsorize

# 1️⃣ Exclude only 'Target' column (binary column) from winsorization
winsorize_cols = [col for col in num_cols if col != 'Target']  # Use num_cols and exclude Target

# 2️⃣ Apply Winsorization (Capping at 1st & 99th Percentile)
for col in winsorize_cols:
    df[col] = winsorize(df[col], limits=[0.01, 0.01])  # Capping at 1st and 99th percentile

# ✅ Winsorization Applied!
print("✅ Winsorization Completed!")

print("📌 Final Dataset Shape:", df.shape)


✅ Winsorization Completed!
📌 Final Dataset Shape: (4524526, 23)


---

# **Outlier Handling Using Winsorization**

## **What is Winsorization?**
Winsorization is a **statistical transformation** technique used to reduce the effect of **extreme outliers** by **limiting** (capping) values instead of removing them. Unlike traditional outlier removal methods (e.g., Interquartile Range (IQR) method), Winsorization **does not remove data points** but instead **modifies extreme values** to a predefined threshold.

---

## **Why Use Winsorization Instead of Removing Outliers?**
- **Preserves data integrity**: Unlike outright removal, Winsorization ensures we **retain all data points** while preventing extreme values from distorting our analysis.
- **Prevents information loss**: In datasets where **outliers carry meaningful information**, removing them entirely could result in loss of critical insights.
- **Works well for skewed data**: Many real-world datasets, especially environmental and water quality data, contain **naturally occurring extreme values** that shouldn't be removed.

---

## **How Does Winsorization Work?**
- Winsorization replaces **extreme values** at both ends of the distribution **with the closest threshold values**.
- Instead of completely removing outliers, **values beyond a certain percentile are capped**.
- This ensures that **outliers do not dominate** the analysis while keeping their presence in the dataset.

---

## **What Does `[0.01, 0.01]` Mean?**
- The two values `[0.01, 0.01]` represent the **percentage of data** to be **Winsorized** at both ends of the distribution.
- **0.01 (1%) at the lower end**: This means that the **lowest 1% of values** are **replaced** with the **value at the 1st percentile**.
- **0.01 (1%) at the upper end**: The **highest 1% of values** are **replaced** with the value at the 99th percentile.
- The Winsorization range ensures that **only the most extreme outliers are modified**, keeping the data more representative of the underlying distribution.

---

## **Why Specifically the 1st and 99th Percentile?**
- The **1st and 99th percentile** were chosen as a **balanced approach**:
  - It **removes the extreme 1% tail on both ends** without affecting the majority of the data.
  - This is common in **environmental and water quality data**, where some natural variability is expected but extreme values may be due to sensor errors or rare contamination events.
  - It avoids **over-smoothing** the data, ensuring that useful information from higher percentiles (e.g., 5th and 95th) remains intact.

---

## **Summary of Our Winsorization Process**
- **We applied Winsorization to all numerical columns except ‘Target’** (since it is a binary classification variable).
- **Extreme values were capped at the 1st and 99th percentile**.
- **No rows were removed**, ensuring all data points remain available for analysis.
- **This helps reduce the effect of extreme outliers** while preserving meaningful trends in the dataset.

---

### **✅ Winsorization Completed Successfully!**
This ensures our dataset is **cleaned from extreme distortions** while retaining its overall distribution for accurate analysis and machine learning models.

---


In [11]:
print(df.describe())

  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(


                 pH          Iron       Nitrate      Chloride           Lead  \
count  4.524526e+06  4.524526e+06  4.524526e+06  4.524526e+06   4.524526e+06   
mean   7.446013e+00  1.107651e-01  6.104349e+00  1.830621e+02   6.188506e-05   
std    8.451137e-01  3.396725e-01  2.960492e+00  6.337548e+01   4.589861e-04   
min    5.037333e+00  1.200793e-16  1.645919e+00  7.357661e+01   0.000000e+00   
25%    6.908848e+00  1.041383e-05  4.000400e+00  1.392692e+02  3.368981e-122   
50%    7.450047e+00  2.226640e-03  5.599301e+00  1.758433e+02   2.137893e-62   
75%    8.000353e+00  5.298038e-02  7.613478e+00  2.161728e+02   2.494635e-27   
max    9.767686e+00  2.343167e+00  1.789541e+01  4.188014e+02   4.147569e-03   

               Zinc     Turbidity      Fluoride        Copper          Odor  \
count  4.524526e+06  4.524526e+06  4.524526e+06  4.524526e+06  4.524526e+06   
mean   1.515383e+00  4.964433e-01  9.453987e-01  4.987727e-01  1.797284e+00   
std    1.425508e+00  7.938897e-01  7.56536

**One-Hot Encoding**

In [12]:
# Categorical Columns
categorical_cols = ["Color", "Source", "Month"]

# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)  # Drop first category to avoid dummy trap

# Check Updated Data
print("✅ One-Hot Encoding Completed!")
print("📌 Updated Dataset Shape:", df_encoded.shape)
print("📝 First 5 Rows After Encoding:\n", df_encoded.head())


✅ One-Hot Encoding Completed!
📌 Updated Dataset Shape: (4524526, 42)
📝 First 5 Rows After Encoding:
          pH      Iron   Nitrate    Chloride           Lead      Zinc  \
0  8.332988  0.000083  8.605777  122.799772   3.713298e-52  3.434827   
1  6.917863  0.000081  3.734167  227.029851   7.849262e-94  1.245317   
2  5.443762  0.020106  3.816994  230.995630   5.286616e-76  0.528280   
3  7.955339  0.143988  8.224944  178.129940  3.997118e-176  4.027879   
4  8.091909  0.002167  9.925788  186.540872  4.171069e-132  3.807511   

   Turbidity  Fluoride    Copper      Odor  ...  Month_December  \
0   0.022683  0.607283  0.144599  1.626212  ...           False   
1   0.019007  0.622874  0.437835  1.686049  ...           False   
2   0.319956  0.423423  0.431588  3.414619  ...           False   
3   0.166319  0.208454  0.239451  1.769302  ...           False   
4   0.004867  0.222912  0.616574  0.795310  ...           False   

   Month_February  Month_January  Month_July  Month_June  Month

In [13]:
df_encoded.to_csv("cleaned_dataset.csv", index=False)


# **Phase:2 Feature Selection**

**Information Gain Calculation**

In [11]:
#  Use the Correct DataFrame (After Encoding)
X = df_encoded.drop(columns=["Target"])  # All features except Target
y = df_encoded["Target"]  # Target variable

In [14]:
from sklearn.feature_selection import mutual_info_classif

# Compute Information Gain (IG)
ig_scores = mutual_info_classif(X, y, random_state=42, n_jobs=-1)  # Parallel Processing

# Create a DataFrame to Store IG Values
ig_df = pd.DataFrame({"Feature": X.columns, "Information Gain": ig_scores})

#  Sort Features by IG Score (Descending Order)
ig_df = ig_df.sort_values(by="Information Gain", ascending=False).reset_index(drop=True)

#  Display Results
print("✅ IG Calculation Completed!")
print(ig_df)


MemoryError: Unable to allocate 97.7 MiB for an array with shape (3201247, 4) and data type float64

In [15]:
# Save IG results to CSV
ig_df.to_csv("thesis_main_ig_calculation.csv", index=False)

print("✅ IG Calculation Results Saved Successfully as thesis_main_ig_calculation.csv!")


✅ IG Calculation Results Saved Successfully as thesis_main_ig_calculation.csv!


## **Paper Implementation (For Selecting IG Method)**

**Fixed Threshold Method**

In [16]:
# Use the 50th percentile (median) as the fixed threshold
fixed_threshold = np.percentile(ig_df["Information Gain"], 50)  

# Select features with IG >= fixed threshold
selected_features_fixed = ig_df.loc[ig_df["Information Gain"] >= fixed_threshold, "Feature"].values

# Display Results
print("\n✅ Features Selected by Fixed Threshold Method (Median-Based):")
print(selected_features_fixed)
print(f"📌 Number of Features Selected: {len(selected_features_fixed)}")



✅ Features Selected by Fixed Threshold Method (Median-Based):
['pH' 'Color_Near Colorless' 'Manganese' 'Turbidity' 'Chloride' 'Copper'
 'Odor' 'Color_Faint Yellow' 'Color_Yellow' 'Nitrate' 'Chlorine'
 'Fluoride' 'Iron' 'Total Dissolved Solids' 'Color_Light Yellow' 'Sulfate'
 'Source_Well' 'Time of Day' 'Source_Stream' 'Source_Lake' 'Source_River']
📌 Number of Features Selected: 21


**Standard Deviation-Based Threshold Approach**

In [17]:
# Calculate the threshold as the standard deviation of IG values
std_threshold = ig_df["Information Gain"].std()

# Select features with IG >= standard deviation threshold
selected_features_std = ig_df.loc[ig_df["Information Gain"] >= std_threshold, "Feature"].values

# Display Results
print("\n✅ Features Selected by Standard Deviation Threshold Method:")
print(selected_features_std)
print(f"📌 Number of Features Selected: {len(selected_features_std)}")



✅ Features Selected by Standard Deviation Threshold Method:
['pH' 'Color_Near Colorless' 'Manganese' 'Turbidity' 'Chloride' 'Copper'
 'Odor' 'Color_Faint Yellow' 'Color_Yellow' 'Nitrate' 'Chlorine'
 'Fluoride' 'Iron' 'Total Dissolved Solids' 'Color_Light Yellow' 'Sulfate'
 'Source_Well' 'Time of Day' 'Source_Stream' 'Source_Lake' 'Source_River'
 'Source_Reservoir' 'Source_Ground' 'Source_Spring' 'Zinc']
📌 Number of Features Selected: 25


**CBFS Method**

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Train Random Forest on the dataset
rf = RandomForestClassifier(n_estimators=100, max_features="sqrt", random_state=42, n_jobs=-1)
rf.fit(X, y)

# Apply CBFS: Select features with importance >= mean importance
cbfs = SelectFromModel(rf, threshold="mean", prefit=False)
cbfs.fit(X, y)  # Fit before selecting features
selected_features_cbfs = X.columns[cbfs.get_support()]

# Display Results
print("\n✅ Features Selected by CBFS Method:")
print(selected_features_cbfs)
print(f"📌 Number of Features Selected: {len(selected_features_cbfs)}")



✅ Features Selected by CBFS Method:
Index(['pH', 'Iron', 'Nitrate', 'Chloride', 'Zinc', 'Turbidity', 'Fluoride',
       'Copper', 'Odor', 'Sulfate', 'Chlorine', 'Manganese',
       'Total Dissolved Solids'],
      dtype='object')
📌 Number of Features Selected: 13


**FFT Method**

In [12]:
from scipy.fft import fft, ifft
from sklearn.feature_selection import mutual_info_classif
import numpy as np

# Step 1: Apply FFT transformation to the dataset (keeping real part)
X_fft = fft(X, axis=0).real  

# Step 2: Apply IFFT to bring the data back to original form
X_ifft = ifft(X_fft, axis=0).real  

# Step 3: Compute Information Gain (IG) on transformed data
information_gain_fft = mutual_info_classif(X_ifft, y, random_state=42, n_jobs=-1)

# Step 4: Compute standard deviation threshold
fft_threshold = np.std(information_gain_fft)  

# Step 5: Select features with IG >= threshold
selected_features_fft = X.columns[information_gain_fft >= fft_threshold]

print("\nFeatures Selected by FFT with Proposed Threshold:")
print(selected_features_fft)
print(f"Number of Features: {len(selected_features_fft)}")



Features Selected by FFT with Proposed Threshold:
Index(['Color_Faint Yellow', 'Color_Light Yellow', 'Color_Near Colorless',
       'Color_Yellow', 'Source_Ground', 'Source_Lake', 'Source_Reservoir',
       'Source_River', 'Source_Spring', 'Source_Stream', 'Source_Well',
       'Month_August', 'Month_December', 'Month_February', 'Month_January',
       'Month_July', 'Month_June', 'Month_March', 'Month_May',
       'Month_November', 'Month_October', 'Month_September'],
      dtype='object')
Number of Features: 22


## **Feature Selection Analysis Summary**  

We applied four different **Information Gain (IG)-based** feature selection methods, each emphasizing different types of features. Below is a structured analysis of their outcomes.  

---

### **🔍 Overview of Selected Features**  

| **Method**                 | **Selected Features** | **Key Characteristics** |
|---------------------------|---------------------|------------------------|
| **Fixed Threshold (Median-Based)** | 21 features | Focused on **chemical properties** (`pH`, `Iron`, `Nitrate`, etc.) with some categorical (`Color_*`, `Source_*`). |
| **Standard Deviation-Based Threshold** | 25 features | Similar to Fixed, but included additional **source-based** features (`Source_*`). |
| **CBFS (Random Forest-Based)** | 13 features | Strictly **chemical-based**, ignoring categorical/time-related variables. |
| **FFT-Based Selection** | 22 features | Selected mostly **categorical (Color, Source) & temporal (Month) features**, ignoring key chemical indicators. |

---

### **📌 Observations & Trends**  

#### **✅ What’s Being Prioritized?**  
✔ **Fixed & Std-Dev Thresholds:**  
   - Prioritize **core water quality indicators** (`pH`, `Iron`, `Nitrate`, `Turbidity`).  
   - Include some categorical features (`Color_*`, `Source_*`).  
✔ **CBFS:**  
   - Selects **only chemical properties**, emphasizing direct pollutant measurements.  
✔ **FFT:**  
   - Detects **seasonal patterns & categorical dependencies**, prioritizing `Month_*` and `Source_*` over chemical values.  

---

### **⚠️ What’s Being Ignored?**  
❌ **CBFS & Threshold-Based Methods:**  
   - **Completely ignore** time-based (`Month_*`) and source-based (`Source_*`) variables, possibly **overlooking seasonal variations**.  
❌ **FFT-Based Selection:**  
   - **Fails to capture key chemical indicators** (e.g., `pH`, `Iron`, `Nitrate`), likely because **chemical values don’t follow strong periodic patterns**.  

---

# **Phase 3: DWTM Paper Implement !**

---
**Now The plan is:**  

1️⃣ **Use all Feature Selection (FS) methods** we implemented.  
2️⃣ **Convert selected features** following the approach in the same way as the paper.  
3️⃣ **Apply models** on all feature sets to compare initial performance.  
4️⃣ **Analyze performance trends** and select the best FS method for further refinements.  

----

## **Standard Deviation-Based Threshold**

In [13]:
# Extract only selected features + Target variable
df_dwtm = df[selected_features_std.tolist() + ["Target"]]

# Display shape & first few rows for verification
print("\n✅ DWTM Data Prepared!")
print("📌 Updated Dataset Shape:", df_dwtm.shape)
print("📝 First 5 Rows:")
print(df_dwtm.head())


NameError: name 'selected_features_std' is not defined