In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
dataset = pd.read_csv('Book2.csv')

# Assuming 'columns_to_drop' contains the names of columns you want to drop
columns_to_drop = ['Toposheet', 'Style', 'State', 'Rock Character', 'Distribution']

# Drop the specified columns from the DataFrame
dataset.drop(columns=columns_to_drop, inplace=True)

# Define categorical features
categorical_features = ['District', 'Material Type', 'Movement Type',
                        'Failure Plane', 'Activity', 'Failure Mechanism',
                        'Geomorphology', 'Weathering', 'Hydrology',
                        'Susceptibility', 'Risk', 'Triggering Factor','Geoscientific Causes']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode categorical features
encoded_features = {}
for feature in categorical_features:
    encoded_features[feature] = label_encoder.fit_transform(dataset[feature])

# Now, encoded_features is a dictionary containing the encoded features


In [2]:
encoded_features

{'District': array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 7, 7, 3, 3, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 3, 3, 3, 3, 3, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0,
        2, 2, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 1, 1, 4, 0, 4, 4, 1, 1, 0,
        0, 0, 0, 1, 0, 0, 

In [3]:
import numpy as np

# Convert the dictionary of encoded features into a DataFrame
encoded_df = pd.DataFrame(encoded_features)

# Concatenate the encoded features DataFrame with any remaining numeric features
X = pd.concat([encoded_df, dataset.select_dtypes(include=[np.number])], axis=1)

# Extract the target variable (assuming it's named 'Target')
y = dataset['Geoscientific Causes']


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define preprocessing steps for numerical and categorical features
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(exclude=[np.number]).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    ('scaler', StandardScaler())  # Standardize features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent value
    ('onehot', OneHotEncoder())  # One-hot encode categorical features
])

# Combine preprocessing steps for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess X_train and X_test
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [5]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder_y = LabelEncoder()

# Encode the target variable (y)
y_encoded = label_encoder_y.fit_transform(y)

# Remove rows with NaNs in y_encoded and corresponding rows in X
indices_with_nan = np.isnan(y_encoded)
X_processed = X[~indices_with_nan]
y_processed = y_encoded[~indices_with_nan]


In [6]:
import pandas as pd

# Assuming X is your feature matrix
# Replace NaN values with 0
X_filled = X_processed.fillna(0)


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the preprocessed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_filled, y_processed, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.5774647887323944


In [8]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Initialize Gradient Boosting classifier with Random Forest as base estimator
gb_rf_classifier = GradientBoostingClassifier(init=rf_classifier, n_estimators=100, random_state=42)

# Train the Gradient Boosting classifier
gb_rf_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = gb_rf_classifier.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Gradient Boosting with Random Forest Accuracy:", accuracy)


Gradient Boosting with Random Forest Accuracy: 0.6338028169014085


In [None]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Initialize the GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=42)

# Initialize the BaggingClassifier with RandomForestClassifier as base estimator
bag = BaggingClassifier(base_estimator=rf, n_estimators=100, n_jobs=-1)

# Fit the model to the training data
bag.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = bag.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Bagging with Random Forest Accuracy:", accuracy)

# Initialize the BaggingClassifier with GradientBoostingClassifier as base estimator
bag = BaggingClassifier(base_estimator=gb, n_estimators=100, n_jobs=-1)

# Fit the model to the training data
bag.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = bag.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Bagging with Gradient Boosting Accuracy:", accuracy)



Bagging with Random Forest Accuracy: 0.5774647887323944


