# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import median_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from matplotlib.patches import ConnectionPatch

# Importing Datasets

In [None]:
df_train = pd.read_csv('train.csv')
df_test  = pd.read_csv('test.csv')

# Visualization of Data

In [None]:
df_train.head()

In [None]:
df_train.describe()

In [None]:
df_test.head()

In [None]:
# # Checking for Collinearity to avoid overfitting

# from statsmodels.stats.outliers_influence import variance_inflation_factor
# from statsmodels.tools.tools import add_constant


# df_train_with_const = add_constant(df_train) 


# vif_data = pd.DataFrame()
# vif_data["feature"] = df_train_with_const.columns
# vif_data["VIF"] = [variance_inflation_factor(df_train_with_const.values, i) for i in range(df_train_with_const.shape[1])]

# print(vif_data)

Checking collinearity helps in machine learning by identifying highly correlated features, which can negatively impact model performance. High collinearity can lead to multicollinearity, where redundant information affects the model's ability to interpret the importance of features, increases variance, and may cause overfitting. Reducing collinearity by removing or combining correlated features improves model accuracy, interpretability, and generalization.
    
    Variance Inflation Factor (VIF) is a measure used to detect the severity of multicollinearity in regression analysis. It quantifies how much the variance of a regression coefficient is inflated due to collinearity with other predictor variables.

Mathematically, VIF is calculated as:

                            VIF_i = 1 / (1-R_i ^ 2)
                          
Where, 
    VIF_i : Variance Inflation Factor for the ith predictor ,
    R_i ^ 2 : R ^ 2 value of the ith predictor.
    
A VIF value:
    1: No correlation between the predictor and other predictors ,
    VIF > 5 or 10: Indicates high multicollinearity, which could affect model performance.
   
Reducing VIF by removing or combining highly correlated features can improve model stability and interpretability.

In [None]:
df_train

In [None]:
# correlation_matrix = df_train.corr()
# plt.figure(figsize=(15, 6))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".1f", linewidths=0.2)
# plt.gcf().set_facecolor('#ffff')  
# plt.title('Correlation Matrix')
# plt.show()

In [None]:
# df_train.drop(columns=['id'], inplace=True)
# df_test.drop(columns=['id'], inplace=True)

In [None]:
df_test.isnull().sum()

In [None]:
y = df_train['FloodProbability'] 
df_train = df_train.drop(['FloodProbability'],axis=1)
X = df_train

In [75]:
y

0          0.445
1          0.450
2          0.530
3          0.535
4          0.415
           ...  
1117952    0.495
1117953    0.480
1117954    0.485
1117955    0.495
1117956    0.560
Name: FloodProbability, Length: 1117957, dtype: float64

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
print(df_test.columns.tolist())

['id', 'MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors']


In [60]:
df_test

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,1117957,4,6,3,5,6,7,8,7,8,...,8,5,7,5,6,3,6,4,4,5
1,1117958,4,4,2,9,5,5,4,7,5,...,2,4,7,4,5,1,7,4,4,3
2,1117959,1,3,6,5,7,2,4,6,4,...,7,9,2,5,5,2,3,6,8,3
3,1117960,2,4,4,6,4,5,4,3,4,...,7,8,4,6,7,6,4,2,4,4
4,1117961,6,3,2,4,6,4,5,5,3,...,4,3,2,6,4,6,8,4,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745300,1863257,5,4,8,3,5,4,4,5,5,...,5,6,1,3,5,6,4,4,6,6
745301,1863258,4,4,2,12,4,3,4,3,5,...,3,7,4,4,3,5,5,3,5,4
745302,1863259,5,7,9,5,5,6,7,5,5,...,6,11,3,11,4,5,9,5,5,4
745303,1863260,4,7,6,3,5,2,3,8,6,...,6,6,8,6,2,3,8,7,5,5


In [61]:
df_train

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,0,5,8,5,8,6,4,4,3,3,...,2,5,3,3,5,4,7,5,7,3
1,1,6,7,4,4,8,8,3,5,4,...,9,7,2,0,3,5,3,3,4,3
2,2,6,5,6,7,3,7,1,5,4,...,6,7,3,7,5,6,8,2,3,3
3,3,3,4,6,5,4,8,4,7,6,...,5,2,4,7,4,4,6,5,7,5
4,4,5,3,2,6,4,4,3,3,3,...,5,2,2,6,6,4,1,2,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117952,1117952,3,3,4,10,4,5,5,7,10,...,3,7,8,7,2,2,1,4,6,4
1117953,1117953,2,2,4,3,9,5,8,1,3,...,5,9,4,4,3,7,4,9,4,5
1117954,1117954,7,3,9,4,6,5,9,1,3,...,5,5,5,5,5,6,5,5,2,4
1117955,1117955,7,3,3,7,5,2,3,4,6,...,6,6,8,5,3,4,6,7,6,4


# Linear Regression

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Assuming X contains your training features and y contains the target variable
reg = LogisticRegression()
reg.fit(X_train, y_train)

# Ensure df_test contains only the features used in training
feature_columns = X_train.columns  # Get feature names from the training set

# Select only the feature columns in df_test
X_test_cleaned = X_test[feature_columns]

# Predicting the target on the cleaned test dataset
pred = reg.predict(X_test_cleaned)

# Create a new DataFrame that combines the test data and predictions
combined_results = pd.concat([df_test.reset_index(drop=True), 
                               pd.Series(pred, name='Flood_Probability')], 
                               axis=1)

# Display the combined results
combined_results


NameError: name 'X_train' is not defined

In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Calculate performance metrics
mse = mean_squared_error(y_test, pred)  # Mean Squared Error
mae = mean_absolute_error(y_test, pred)  # Mean Absolute Error
r2 = r2_score(y_test, pred)  # R-squared value

# Calculate error percentage
error_percent = (mae / y_test.mean()) * 100

# Display metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")
print(f"Error Percentage: {error_percent}%")

Mean Squared Error (MSE): 0.00040320658709055805
Mean Absolute Error (MAE): 0.01579247136376033
R-squared (R2): 0.8448773362840329
Error Percentage: 3.1304398518836716%


# Random Forest Regression

In [22]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Assuming X_train, y_train, X_test, y_test are already defined
# Instantiate the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on X_test
y_pred = rf_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Optionally, you can print out the predictions and actual values side by side
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results.head())


Mean Squared Error: 0.0008964490804568145
         Actual  Predicted
1105809   0.560    0.51885
442591    0.575    0.52935
5434      0.510    0.53235
82966     0.530    0.53995
584893    0.590    0.57065


# KNearestNeighbor Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Assuming your DataFrame is named df and your target variable is known
# Example target variable, please adjust based on your context
# y = df['TargetColumnName']  # Replace with your target column name
# X = df.drop(columns=['TargetColumnName'])  # Drop the target column

# Initialize the KNeighborsRegressor model
knn_model = KNeighborsRegressor(n_neighbors=5)  # You can adjust n_neighbors

# Train the model
knn_model.fit(X_train, y_train)

# Predict on the test set
y_pred = knn_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Optional: Display predictions alongside actual values
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(results.head())


In [100]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Check for missing values and basic statistics
print(train_data.info())
print(train_data.describe())
print(train_data.isnull().sum())

# Assuming 'FloodProbability' is the target variable
X_train = train_data.drop(['FloodProbability'], axis=1)
y_train = train_data['FloodProbability']

# Assuming test_data does not contain the target variable
X_test = test_data

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC()
}

# Train models and evaluate
accuracy_results = {}
mse_results = {}

for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predict on test data
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    # Note: Replace y_test with actual target labels if available for test data
    # Here, we're using train data for calculating accuracy and MSE
    train_pred = model.predict(X_train)
    accuracy = accuracy_score(y_train, train_pred)
    mse = mean_squared_error(y_train, train_pred)
    
    # Store results
    accuracy_results[name] = accuracy
    mse_results[name] = mse
    
    print(f"{name} - Accuracy: {accuracy:.4f}, MSE: {mse:.4f}")

# Plotting results
plt.figure(figsize=(14, 6))

# Accuracy plot
plt.subplot(1, 2, 1)
plt.barh(list(accuracy_results.keys()), list(accuracy_results.values()), color='skyblue')
plt.xlabel('Accuracy')
plt.title('Model Accuracy Comparison')

# MSE plot
plt.subplot(1, 2, 2)
plt.barh(list(mse_results.keys()), list(mse_results.values()), color='salmon')
plt.xlabel('Mean Squared Error')
plt.title('Model MSE Comparison')

plt.tight_layout()
plt.show()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117957 entries, 0 to 1117956
Data columns (total 22 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   id                               1117957 non-null  int64  
 1   MonsoonIntensity                 1117957 non-null  int64  
 2   TopographyDrainage               1117957 non-null  int64  
 3   RiverManagement                  1117957 non-null  int64  
 4   Deforestation                    1117957 non-null  int64  
 5   Urbanization                     1117957 non-null  int64  
 6   ClimateChange                    1117957 non-null  int64  
 7   DamsQuality                      1117957 non-null  int64  
 8   Siltation                        1117957 non-null  int64  
 9   AgriculturalPractices            1117957 non-null  int64  
 10  Encroachments                    1117957 non-null  int64  
 11  IneffectiveDisasterPreparedness  1117957 non-null 

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Check for missing values and basic statistics
print(train_data.info())
print(train_data.describe())
print(train_data.isnull().sum())

# Assuming 'FloodProbability' is the target variable
X_train = train_data.drop(['FloodProbability'], axis=1)
y_train = train_data['FloodProbability']

# Convert continuous target to discrete classes (example)
# Adjust the binning logic based on your data distribution
y_train = pd.cut(y_train, bins=[-float('inf'), 0.3, 0.6, float('inf')], labels=[0, 1, 2])

# Assuming test_data does not contain the target variable
X_test = test_data

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC()
}

# Train models and evaluate
accuracy_results = {}
mse_results = {}

for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predict on train data
    train_pred = model.predict(X_train)
    
    # Calculate metrics
    accuracy = accuracy_score(y_train, train_pred)
    mse = mean_squared_error(y_train, train_pred)
    
    # Store results
    accuracy_results[name] = accuracy
    mse_results[name] = mse
    
    print(f"{name} - Accuracy: {accuracy:.4f}, MSE: {mse:.4f}")

# Plotting results
plt.figure(figsize=(14, 6))

# Accuracy plot
plt.subplot(1, 2, 1)
plt.barh(list(accuracy_results.keys()), list(accuracy_results.values()), color='skyblue')
plt.xlabel('Accuracy')
plt.title('Model Accuracy Comparison')

# MSE plot
plt.subplot(1, 2, 2)
plt.barh(list(mse_results.keys()), list(mse_results.values()), color='salmon')
plt.xlabel('Mean Squared Error')
plt.title('Model MSE Comparison')

plt.tight_layout()
plt.show()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117957 entries, 0 to 1117956
Data columns (total 22 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   id                               1117957 non-null  int64  
 1   MonsoonIntensity                 1117957 non-null  int64  
 2   TopographyDrainage               1117957 non-null  int64  
 3   RiverManagement                  1117957 non-null  int64  
 4   Deforestation                    1117957 non-null  int64  
 5   Urbanization                     1117957 non-null  int64  
 6   ClimateChange                    1117957 non-null  int64  
 7   DamsQuality                      1117957 non-null  int64  
 8   Siltation                        1117957 non-null  int64  
 9   AgriculturalPractices            1117957 non-null  int64  
 10  Encroachments                    1117957 non-null  int64  
 11  IneffectiveDisasterPreparedness  1117957 non-null 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Check for missing values and basic statistics
print(train_data.info())
print(train_data.describe())
print(train_data.isnull().sum())

# Assuming 'FloodProbability' is the target variable
X_train = train_data.drop(['FloodProbability'], axis=1)
y_train = train_data['FloodProbability']

# Convert continuous 'FloodProbability' to binary classes
# Set threshold, here it is 0.5
threshold = 0.5
y_train = (y_train >= threshold).astype(int)  # 0 if < threshold, 1 if >= threshold

# Assuming test_data does not contain the target variable
X_test = test_data

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC()
}

# Train models and evaluate
accuracy_results = {}
mse_results = {}

for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predict on train data
    train_pred = model.predict(X_train)
    
    # Calculate metrics
    accuracy = accuracy_score(y_train, train_pred)
    mse = mean_squared_error(y_train, train_pred)
    
    # Store results
    accuracy_results[name] = accuracy
    mse_results[name] = mse
    
    print(f"{name} - Accuracy: {accuracy:.4f}, MSE: {mse:.4f}")

# Plotting results
plt.figure(figsize=(14, 6))

# Accuracy plot
plt.subplot(1, 2, 1)
plt.barh(list(accuracy_results.keys()), list(accuracy_results.values()), color='skyblue')
plt.xlabel('Accuracy')
plt.title('Model Accuracy Comparison')

# MSE plot
plt.subplot(1, 2, 2)
plt.barh(list(mse_results.keys()), list(mse_results.values()), color='salmon')
plt.xlabel('Mean Squared Error')
plt.title('Model MSE Comparison')

plt.tight_layout()
plt.show()
