# Data Processing and Machine Learning in Python

This notebook covers data preprocessing, classification, and regression tasks using various Python libraries like `pandas`, `numpy`, and `scikit-learn`. 

## Importing Libraries ##

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
import seaborn as sns
from time import sleep

## Converting String Values to Numeric ##

### Properties ###
- Finds out if any String Values are present in the dataset
- If a column has few distinct values (fewer than 10 unique values), it uses LabelEncoder to transform the values
- If a column has many distinct values (10 or more unique values), it uses OneHotEncoder to transform the values

In [10]:
# Fitted LabelEncoders keyed by column name. stringToNumeric() fills this in so
# encoded labels can later be mapped back with `inverse_transform`.
label_encoders = {}


def stringToNumeric(df):
    """Return a copy of ``df`` in which every text column is numerically encoded.

    A column is treated as text when its dtype is object/string and it holds
    at least one ``str`` value. Columns with fewer than 10 unique values are
    label-encoded; the others are one-hot encoded (first category dropped) and
    replaced by one ``"<column>_<category>"`` column per remaining category.

    Fitted LabelEncoders are stored in the module-level ``label_encoders``
    dict under the column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; all changes happen on a copy.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.

    Notes
    -----
    Missing values are not treated specially here; impute text columns before
    calling this function if they should not be encoded as-is.
    """
    print("\nChecking for String Values...")
    sleep(1)

    # Work on a copy so the caller's frame is never left half-transformed.
    encoded = df.copy()
    # Tracks whether ANY column contained strings (not just the last one seen).
    found_strings = False

    # Iterate over the original column list: one-hot encoding adds new,
    # already-numeric columns to `encoded` that must not be revisited.
    for column in df.columns:
        series = encoded[column]
        is_text_dtype = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text_dtype:
            continue

        string_count = int(series.map(lambda value: isinstance(value, str)).sum())
        print(f"\nString Values in '{column}': {string_count}")
        if string_count == 0:
            continue
        found_strings = True

        unique_values = series.nunique()
        print(f"Unique Values in '{column}': {unique_values}")

        if unique_values < 10:
            le = LabelEncoder()
            encoded[column] = le.fit_transform(series)
            label_encoders[column] = le
            print(f"\n'{column}' encoded using LabelEncoder.")
        else:
            # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
            # removed in 1.4; fall back to the old name on older releases.
            try:
                ohe = OneHotEncoder(sparse_output=False, drop='first')
            except TypeError:
                ohe = OneHotEncoder(sparse=False, drop='first')
            encoded_columns = ohe.fit_transform(encoded[[column]])
            # drop='first' removes the first category, hence the [1:] slice.
            ohe_column_names = [f"{column}_{cat}" for cat in ohe.categories_[0][1:]]
            encoded_df = pd.DataFrame(
                encoded_columns, columns=ohe_column_names, index=encoded.index
            )
            encoded = pd.concat([encoded.drop(columns=column), encoded_df], axis=1)
            print(f"\n'{column}' encoded using OneHotEncoder.")

    if not found_strings:
        print("\nNo string values found in the Dataset")
    return encoded

## Handling Null Values ##
### Properties: ###
- Counts the number of Null Values in the Dataset
- If none, returns the Dataframe as it is
- If nulls are present in a numeric (Float or Int) column, they are filled with the column mean
- If nulls are present in any other column (e.g. Strings), they are filled with the column mode

In [11]:
def nullToNumeric(df):
    """Fill missing values column by column and return the result.

    Numeric columns are filled with their mean; every other column is filled
    with its most frequent value (mode). Columns that contain nothing but
    nulls have no mean/mode to fill with and are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when it holds no nulls, otherwise a filled copy.
    """
    null_counts = df.isnull().sum()
    total_nulls = int(null_counts.sum())
    print("\nChecking for Null Values...")
    sleep(1)

    if total_nulls == 0:
        print("\nNo null values found in the Dataset")
        return df

    print(f"\nTotal Number of Null Values: {total_nulls}")
    filled = df.copy()
    unfilled_columns = []

    for column in filled.columns:
        null_count = null_counts[column]
        if null_count == 0:
            continue
        print(f"\nThe number of null values in the Column '{column}' are: {null_count}")

        series = filled[column]
        if series.notna().sum() == 0:
            # mode() would be empty (indexing it raises) and mean() would be NaN.
            print(f"Column '{column}' contains only null values; leaving it unchanged.")
            unfilled_columns.append(column)
            continue

        is_numeric = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if is_numeric:
            if pd.api.types.is_integer_dtype(series):
                # An integer column that holds nulls cannot store a fractional
                # mean, so widen it to float before filling.
                series = series.astype('float64')
            fill_value = series.mean()
            print(f"Filling nulls in '{column}' with mean: {fill_value:.2f}")
        else:
            fill_value = series.mode()[0]
            print(f"Filling nulls in '{column}' with mode: '{fill_value}'")
        filled[column] = series.fillna(fill_value)

    if unfilled_columns:
        print(f"\nNull values handled, except for all-null columns: {unfilled_columns}")
    else:
        print("\nNull values handled successfully!")
    return filled

## Correlation Analysis ##
### Properties: ###
- Compares every feature with the target
- The feature with the highest absolute correlation with the target is returned

In [12]:
def bestFeature(df, targetCol):
    """Return the name of the column most strongly correlated with ``targetCol``.

    Only numeric/boolean columns take part in the correlation, so leftover
    text columns no longer make ``DataFrame.corr`` fail. Strength is measured
    by the absolute value of the correlation coefficient; columns whose
    correlation is undefined (NaN, e.g. constant columns) are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the target and the candidate features.
    targetCol : str
        Name of the target column.

    Returns
    -------
    str
        Name of the best-correlated feature column.

    Raises
    ------
    KeyError
        If ``targetCol`` is not a column of ``df``.
    ValueError
        If ``targetCol`` is not numeric, or no other column has a defined
        correlation with it.
    """
    print(f"Calculating correlation of features with the feature {targetCol}")
    sleep(1)

    if targetCol not in df.columns:
        raise KeyError(targetCol)

    numeric_df = df.select_dtypes(include=['number', 'bool'])
    if targetCol not in numeric_df.columns:
        raise ValueError(
            f"Target column '{targetCol}' must be numeric to compute correlations"
        )

    correlations = numeric_df.corr()[targetCol].drop(targetCol).dropna()
    if correlations.empty:
        raise ValueError(
            f"No numeric feature has a defined correlation with '{targetCol}'"
        )

    best_feature = correlations.abs().idxmax()
    max_correlation = correlations[best_feature]
    print(f"The feature best correlated with the target '{targetCol}' is '{best_feature}'")
    print(f"Correlation coefficient: {max_correlation:.4f}")
    return best_feature

## Feature Scaling ##
- Analyzes the dataset to determine the appropriate scaling method
- Excludes the target column
- Selects the numeric feature columns; if there are none, scaling is skipped
- If the largest feature range (max - min) is greater than 10, MinMaxScaler is used; otherwise StandardScaler is used

In [13]:
def scaling(df, targetCol, range_threshold=10):
    """Scale the numeric feature columns of ``df`` and return the result.

    The target column is never scaled. The scaler is picked from the widest
    feature range (max - min): above ``range_threshold`` a MinMaxScaler is
    used, otherwise a StandardScaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    targetCol : str
        Name of the column to exclude from scaling.
    range_threshold : float, default 10
        Largest feature range for which StandardScaler is still used.

    Returns
    -------
    pandas.DataFrame
        A scaled copy of ``df``, or ``df`` itself when there is nothing to
        scale.
    """
    print("\n--------------------------------------------------------------------------------------------------------------------------------------------------------------")
    print("Analyzing the dataset to determine the appropriate scaling method...")
    sleep(1)

    # Every numeric dtype qualifies (not only float64/int64); the target is excluded.
    numeric_cols = [
        column
        for column in df.select_dtypes(include='number').columns
        if column != targetCol
    ]

    if not numeric_cols:
        print("No numeric columns to scale. Skipping scaling.")
        return df

    if len(df) == 0:
        # The scalers cannot be fitted on zero rows.
        print("Dataset has no rows. Skipping scaling.")
        return df

    # Analyze the range of the numeric columns
    ranges = df[numeric_cols].max() - df[numeric_cols].min()

    if ranges.max() > range_threshold:
        print("\nUsing MinMaxScaler due to the wide range of values.")
        scaler = MinMaxScaler()
    else:
        print("\nUsing StandardScaler for standardization.")
        scaler = StandardScaler()

    # Apply the chosen scaler on a copy so the caller's frame stays intact.
    scaled = df.copy()
    scaled[numeric_cols] = scaler.fit_transform(scaled[numeric_cols])
    sleep(1)
    print("\nScaling complete!")
    print(f"Scaled Columns: {numeric_cols}")

    return scaled

## Applying The Classification Model ##
- Asks the user for the target column
- Asks the user whether they want to scale the dataset or not
- Separates the target feature and the best feature into 'y' & 'x' respectively
- Shows plots in order to visualize the dataset
- Uses either GaussianNB or CategoricalNB for the features
    - GaussianNB for Continuous values
    - CategoricalNB for Categorical values
- Splits the data into Training & Testing splits (80 / 20)
- Applies the Model
- Predicts the values in the Testing Split
- Uses Metrics to output the model's performance

In [14]:
def createClassification(df, test_size=0.2, random_state=42):
    """Interactively train a single-feature Naive Bayes classifier on ``df``.

    The user is prompted for the target column and for whether to scale the
    data. The feature best correlated with the target (see ``bestFeature``)
    is plotted against it and used as the only predictor. Floating-point
    features are modelled with GaussianNB; non-negative discrete features
    with CategoricalNB. Accuracy, the confusion matrix and the classification
    report for the held-out split are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed data.
    test_size : float, default 0.2
        Fraction of rows held out for testing.
    random_state : int, default 42
        Seed for the train/test split.

    Returns
    -------
    GaussianNB or CategoricalNB
        The fitted model.

    Raises
    ------
    ValueError
        If the entered target column does not exist in ``df``.
    """
    targetCol = input("\nWhat is the Target Column?: ").strip()

    if targetCol not in df.columns:
        raise ValueError(f"Column {targetCol} does not exist, try again")

    print("\n--------------------------------------------------------------------------------------------------------------------------------------------------------------")
    scale_choice = None
    while scale_choice not in ('yes', 'no'):
        scale_choice = input("Do you want to scale the data? (yes/no): ").strip().lower()
        if scale_choice not in ('yes', 'no'):
            print("\nInvalid input. Please type 'yes' or 'no'.")

    if scale_choice == 'yes':
        df = scaling(df, targetCol)

    best_feature = bestFeature(df, targetCol)
    # SEPARATING THE TARGET COLUMN
    x = df[best_feature]
    y = df[targetCol]

    # Visualization
    print("\n--------------------------------------------------------------------------------------------------------------------------------------------------------------")
    print("Visualizing the relationship between the best feature and the target...")
    sleep(0.5)
    plt.figure(figsize=(8, 6))

    if y.dtype in ['float64', 'int64']:  # If target is numeric
        plt.scatter(x, y, alpha=0.7, color='blue')
        plt.title(f"Scatter Plot of '{best_feature}' vs '{targetCol}'")
        plt.xlabel(best_feature)
        plt.ylabel(targetCol)
    else:  # If target is categorical
        sns.boxplot(x=y, y=x, palette="viridis")
        plt.title(f"Box Plot of '{best_feature}' by '{targetCol}'")
        plt.xlabel(targetCol)
        plt.ylabel(best_feature)

    plt.grid(alpha=0.3)
    plt.show()

    feature_values = x.values.reshape(-1, 1)  # scikit-learn expects a 2D array

    # Check if the feature values are continuous or discrete
    print("\n--------------------------------------------------------------------------------------------------------------------------------------------------------------")
    if np.issubdtype(x.dtype, np.floating):  # Continuous data
        print("Feature values are continuous. Using Gaussian Naive Bayes.")
        nbModel = GaussianNB()
    elif feature_values.min() < 0:
        # CategoricalNB only accepts non-negative category codes.
        print("Feature values are discrete but include negative values. Using Gaussian Naive Bayes.")
        nbModel = GaussianNB()
    else:  # Discrete data
        print("Feature values are discrete. Using Categorical Naive Bayes.")
        # Size the category table from the full feature so a code that only
        # shows up in the test split cannot index past what fit() has seen.
        nbModel = CategoricalNB(min_categories=int(feature_values.max()) + 1)

    # SPLITTING DATA
    xTrain, xTest, yTrain, yTest = tts(
        feature_values, y, test_size=test_size, random_state=random_state
    )

    # APPLYING MODEL
    nbModel.fit(xTrain, yTrain)

    yPred = nbModel.predict(xTest)
    results = pd.DataFrame({'Actual': yTest, 'Predicted': yPred})
    print(results.head())

    # METRICS
    accuracy = accuracy_score(yTest, yPred)
    conf_matrix = confusion_matrix(yTest, yPred)
    class_report = classification_report(yTest, yPred, zero_division=0)

    sleep(0.5)
    print("\n--------------------------------------------------------------------------------------------------------------------------------------------------------------")
    print(f"ACCURACY SCORE: {accuracy * 100:.2f}%")

    print("\n--------------------------------------------------------------------------------------------------------------------------------------------------------------")
    print("CONFUSION MATRIX:")
    print(conf_matrix)

    print("\n--------------------------------------------------------------------------------------------------------------------------------------------------------------")
    print("CLASSIFICATION REPORT:")
    print(class_report)
    print("\nNaive Bayes Classifier Model Trained!")

    return nbModel

## Applying The Regression Model ##
- Asks the user for the target column
- Asks the user whether they want to scale the dataset or not
- Separates the target feature and the best feature into 'y' & 'x' respectively
- Splits the data into Training & Testing splits (70 / 30)
- Finds out the Coefficient and Intercept
- Uses the LinearRegression Model
- Applies the Model
- Predicts the values in the Testing Split
- Uses Metrics (MSE & R2 Score) to output the model's performance
- Shows plots in order to visualize the dataset

In [15]:
def createRegression(df, test_size=0.3, random_state=42):
    """Interactively fit a single-feature linear regression on ``df``.

    The user is prompted for the target column and for whether to scale the
    data. The feature best correlated with the target (see ``bestFeature``)
    is used as the only predictor. The coefficient, intercept, a preview of
    the predictions, the mean squared error and the R-squared score for the
    held-out split are printed, followed by a scatter plot of the feature
    against the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed data.
    test_size : float, default 0.3
        Fraction of rows held out for testing.
    random_state : int, default 42
        Seed for the train/test split.

    Returns
    -------
    LinearRegression
        The fitted model.

    Raises
    ------
    ValueError
        If the entered target column does not exist in ``df``.
    """
    targetCol = input("\nWhat is the Target Column?: ").strip()

    if targetCol not in df.columns:
        raise ValueError(f"Column {targetCol} does not exist, try again")

    print("\n--------------------------------------------------------------------------------------------------------------------------------------------------------------")
    scale_choice = None
    while scale_choice not in ('yes', 'no'):
        scale_choice = input("Do you want to scale the data? (yes/no): ").strip().lower()
        if scale_choice not in ('yes', 'no'):
            print("\nInvalid input. Please type 'yes' or 'no'.")

    if scale_choice == 'yes':
        df = scaling(df, targetCol)

    best_feature = bestFeature(df, targetCol)
    # SEPARATING THE TARGET COLUMN
    x = df[best_feature].values.reshape(-1, 1)  # scikit-learn expects a 2D array
    y = df[targetCol]

    # TRAINING
    xTrain, xTest, yTrain, yTest = tts(
        x, y, test_size=test_size, random_state=random_state
    )

    model = LinearRegression()
    model.fit(xTrain, yTrain)

    print("\n--------------------------------------------------------------------------------------------------------------------------------------------------------------")
    # Display the coefficients
    print(f"Coefficient: {model.coef_[0]}")
    print(f"Intercept: {model.intercept_}\n")

    # Predict on the test set
    yPred = model.predict(xTest)

    results = pd.DataFrame({'Actual': yTest, 'Predicted': yPred})
    sleep(1)
    print(f"RESULTS: \n{results.head()}")

    mse = mean_squared_error(yTest, yPred)
    r2 = r2_score(yTest, yPred)

    sleep(0.5)
    print(f"\nMean Squared Error: {mse}")
    print(f"\nR-squared: {r2}")

    # Visualization
    print("\n--------------------------------------------------------------------------------------------------------------------------------------------------------------")
    print("Visualizing the relationship between the best feature and the target...")
    sleep(0.5)
    plt.figure(figsize=(8, 6))

    # Plot straight from the frame: `x` has been reshaped to 2D for the model.
    plt.scatter(df[best_feature], df[targetCol], alpha=0.7, color='blue')
    plt.xlabel(best_feature)
    plt.ylabel(targetCol)
    plt.title(f"Scatter Plot of '{best_feature}' vs '{targetCol}'")
    plt.grid(True)
    plt.show()

    return model

## Taking User Inputs and Running the Program ##
- Loads the CSV File and analyzes it, using the '.info()' function 
- Calls the stringToNumeric & nullToNumeric Functions
- Asks the user to select between Classification & Regression
- Error checking methods implemented

In [None]:
try:
    df = pd.read_csv("iris.csv")
    print("CSV file successfully loaded!")
    sleep(0.3)
    print("\nAnalyzing the Dataset...")
    sleep(1)
    print()
    print(df.info())

    df = stringToNumeric(df)
    df = nullToNumeric(df)

    validChoice = False
    while not validChoice:
        choice = input("\nChoose:\n1. Classification\n2. Regression\n")
        match choice:
                case '1':
                    print("--------------------------------------------------------------------------------------------------------------------------------------------------------------")
                    classOutput = createClassification(df)
                    validChoice = True
                case '2':
                    print("--------------------------------------------------------------------------------------------------------------------------------------------------------------")
                    createRegression(df)
                    validChoice = True
                case _:
                    print("Incorrect Input, Try again")


except FileNotFoundError as e:
    print(f"Error: {e}")

except pd.errors.EmptyDataError:
    print("Error: The file is empty or not a valid CSV file.")

except ValueError as e:
    print(f"Error: {e}")

except Exception as e:
    print(f"An unexpected error occurred: {e}")