## Importing main libraries

# In[1]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,mean_squared_error
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tkinter import Tk, Button, filedialog, Text, Listbox, Scrollbar, Toplevel, Label, Entry,LEFT
import threading

## Browse the data


# In[2]:
# Shared application state, filled in by the GUI callbacks below:
#   df                       - the DataFrame loaded by browse_file()
#   target_column            - label column used by the classifiers
#   target_column_regression - label column used by the regressors
#   num_clusters             - cluster count entered for KMeans
#   percent                  - percentile entered for feature selection
df = target_column = target_column_regression = num_clusters = percent = None
def browse_file():
    """Ask for a CSV file and load it into the global ``df``.

    Does nothing when the dialog is cancelled. Success or the error text is
    written to the output panel through ``display_data``.
    """
    global df
    chosen_path = filedialog.askopenfilename(
        filetypes=[("CSV files", "*.csv"), ("All files", "*.*")]
    )
    if not chosen_path:
        return
    try:
        loaded = pd.read_csv(chosen_path)
        df = loaded
        display_data("Data loaded successfully.")
    except Exception as e:
        display_data("Error loading data: " + str(e))

# Preprocessing

# In[3]:
def head_of_data():
    """Return the leading rows of the loaded DataFrame (``DataFrame.head`` defaults)."""
    first_rows = df.head()
    return first_rows

def tail_of_data():
    """Return the trailing rows of the loaded DataFrame (``DataFrame.tail`` defaults)."""
    last_rows = df.tail()
    return last_rows

def info_about_data():
    """Return the ``DataFrame.info()`` report of the loaded data as a string.

    ``DataFrame.info`` writes its report to a stream and returns ``None``, so
    the text is captured in an in-memory buffer. (Stringifying the method
    object itself, without calling it, never produces the report.)
    """
    import io  # local import: only this function needs it

    buffer = io.StringIO()
    df.info(buf=buffer)
    return buffer.getvalue()

def describe_data():
    """Return the text form of ``df.describe()`` for the loaded DataFrame."""
    summary = df.describe()
    return str(summary)

def shape_of_data():
    """Return the ``shape`` attribute of the loaded DataFrame."""
    dimensions = df.shape
    return dimensions

def columns_of_data():
    """Return the column labels of the loaded DataFrame as a plain list."""
    labels = df.columns
    return labels.tolist()

def nans_in_data():
    """Return the per-column count of missing values in the loaded DataFrame."""
    missing_mask = df.isna()
    return missing_mask.sum()

def drop_columns(column):
    """Drop ``column`` from the global DataFrame in place and report the outcome.

    A name that is not among ``df.columns`` is reported through
    ``display_data`` and the data is left untouched.
    """
    global df
    if column not in df.columns:
        display_data(f"Column '{column}' not found.")
        return
    df.drop(columns=column, inplace=True)
    display_data(f"Column '{column}' dropped successfully.")


def drop_na():
    """Remove, in place, every row of the global DataFrame that has a missing value."""
    global df
    # axis=0 spells out the row-wise default of DataFrame.dropna.
    df.dropna(axis=0, inplace=True)
    display_data("Nans dropped successfully.")

def simple_imputer(strategy):
    """Fill missing values in every non-object column using ``SimpleImputer``.

    Args:
        strategy: Passed unchanged as ``SimpleImputer(strategy=...)``.
    """
    global df
    non_object_cols = df.select_dtypes(exclude='object').columns
    filler = SimpleImputer(missing_values=np.nan, strategy=strategy)
    df[non_object_cols] = filler.fit_transform(df[non_object_cols])
    display_data("Data imputed successfully.")


def label_encoder(column):
    """Replace an object-typed column with integer codes from ``LabelEncoder``.

    Args:
        column: Name of the column to encode.

    An unknown column name or a non-object column is reported through
    ``display_data`` and the data is left unchanged.
    """
    global df
    # Same missing-column handling as drop_columns(); without it df[column]
    # raises a KeyError that nobody reports to the user.
    if column not in df.columns:
        display_data(f"Column '{column}' not found.")
        return
    if df[column].dtype != "object":
        display_data("column must be an object datatype.")
        return
    df[column] = LabelEncoder().fit_transform(df[column])
    display_data("Data encoded successfully.")



def one_encoder(column):
    """Replace an object-typed column with its one-hot encoded columns.

    Args:
        column: Name of the column to encode.

    The new indicator columns are appended after the remaining original
    columns. An unknown column name or a non-object column is reported
    through ``display_data`` and the data is left unchanged.
    """
    global df
    # Same missing-column handling as drop_columns().
    if column not in df.columns:
        display_data(f"Column '{column}' not found.")
        return
    if df[column].dtype != "object":
        display_data("column must be an object datatype.")
        return

    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(df[[column]])
    # Reuse df's index: concat aligns on index labels, so a fresh RangeIndex
    # would misalign rows (and introduce NaNs) whenever df's index has gaps,
    # e.g. after rows were dropped.
    encoded_df = pd.DataFrame(
        encoded.toarray(),
        columns=encoder.get_feature_names_out([column]),
        index=df.index,
    )
    df = pd.concat([df.drop(column, axis=1), encoded_df], axis=1)
    display_data("Data encoded successfully.")



def scaler():
    """Standardize the global DataFrame with ``StandardScaler``.

    The first column whose dtype is object, category, integer or string is
    held out, left unscaled and re-attached as the last column. When no such
    column exists the whole frame is scaled. The result replaces ``df`` and
    is built without an explicit index.
    """
    global df

    held_out_candidates = df.select_dtypes(
        include=['object', 'category', 'int64', 'int', "string"]
    ).columns
    standardizer = StandardScaler()

    if len(held_out_candidates) > 0:
        held_out = held_out_candidates[0]
        features = df.drop(columns=[held_out])
        held_out_values = df[held_out]
        rescaled = pd.DataFrame(
            standardizer.fit_transform(features), columns=features.columns
        )
        # Positional re-attachment: the rebuilt frame has a new index.
        rescaled[held_out] = held_out_values.values
    else:
        rescaled = pd.DataFrame(standardizer.fit_transform(df), columns=df.columns)

    df = rescaled
    display_data("Data scaled successfully.")






def min_max_scaler():
    """Rescale the global DataFrame with ``MinMaxScaler``.

    The first column whose dtype is object, category, integer or string is
    held out, left unscaled and re-attached as the last column. When no such
    column exists the whole frame is scaled. The result replaces ``df`` and
    is built without an explicit index.
    """
    global df

    held_out_candidates = df.select_dtypes(
        include=['object', 'category', 'int64', 'int', "string"]
    ).columns
    normalizer = MinMaxScaler()

    if len(held_out_candidates) > 0:
        held_out = held_out_candidates[0]
        features = df.drop(columns=[held_out])
        held_out_values = df[held_out]
        rescaled = pd.DataFrame(
            normalizer.fit_transform(features), columns=features.columns
        )
        # Positional re-attachment: the rebuilt frame has a new index.
        rescaled[held_out] = held_out_values.values
    else:
        rescaled = pd.DataFrame(normalizer.fit_transform(df), columns=df.columns)

    df = rescaled
    display_data("Data scaled successfully.")



# Dimensionality reduction (PCA)

def apply_pca(n):
    """Replace the feature columns of the global DataFrame with ``n`` principal components.

    Args:
        n: Number of components; converted with ``int`` so the text read from
            an entry widget is accepted.

    The first object/category/integer column is held out of the projection
    and re-attached after the ``PC1..PCn`` columns.

    Raises:
        ValueError: If ``n`` cannot be converted to an integer.
    """
    global df
    n = int(n)
    target_column = df.select_dtypes(include=['object', 'category', 'int64', "int"]).columns[0]
    X = df.drop(columns=[target_column])
    y = df[target_column]
    principal_components = PCA(n_components=n).fit_transform(X)

    pc_df = pd.DataFrame(data=principal_components, columns=[f'PC{i+1}' for i in range(n)])
    # Attach positionally (.values): pc_df has a fresh RangeIndex, so assigning
    # the Series itself would align on index labels and yield NaNs whenever
    # df's index is not 0..len-1 (e.g. after dropping rows).
    pc_df[target_column] = y.values
    df = pc_df
    display_data("Data transformed successfully.")






def ApplyOverSampling():
    """Resample the global DataFrame with ``SMOTE``.

    The first object/category/integer/string column is used as the label; the
    resampled features and labels are joined column-wise into the new ``df``.
    """
    global df
    label_col = df.select_dtypes(include=['object', 'category', 'int64', 'int', "string"]).columns[0]
    features, labels = df.drop(columns=[label_col]), df[label_col]
    resampled_X, resampled_y = SMOTE().fit_resample(features, labels)
    df = pd.concat([resampled_X, resampled_y], axis=1)
    display_data("data transforme succesfully")

def ApplyUnderSampling():
    """Resample the global DataFrame with ``RandomUnderSampler``.

    The first object/category/integer/string column is used as the label; the
    resampled features and labels are joined column-wise into the new ``df``.
    """
    global df
    sampler = RandomUnderSampler()
    label_col = df.select_dtypes(include=['object', 'category', 'int64', 'int', "string"]).columns[0]
    features, labels = df.drop(columns=[label_col]), df[label_col]
    resampled_X, resampled_y = sampler.fit_resample(features, labels)
    df = pd.concat([resampled_X, resampled_y], axis=1)
    display_data("data transforme succesfully")
                   




## Feature selection

# In[4]:
def ApplySelectPercentile(percentile, score_func=f_classif):
    """Keep only the features selected by ``SelectPercentile`` in the global DataFrame.

    Args:
        percentile: Passed as ``SelectPercentile(percentile=...)``.
        score_func: Scoring function handed to the selector.

    The first object/category/integer/string column is used as the label and
    is re-attached after the selected feature columns.
    """
    global df
    target_column = df.select_dtypes(include=['object', 'category', 'int64', 'int', "string"]).columns[0]
    X = df.drop(columns=[target_column])
    y = df[target_column]
    selector = SelectPercentile(score_func=score_func, percentile=percentile)
    x_new = selector.fit_transform(X, y)
    # The support mask is defined over X's columns. Indexing df.columns[:-1]
    # is only correct when the label happens to be the last column.
    selected_columns = X.columns[selector.get_support()]
    reduced = pd.DataFrame(x_new, columns=selected_columns)
    # Positional re-attachment: `reduced` has a fresh RangeIndex, so an
    # index-aligned concat with y would misalign rows for a non-default index.
    reduced[target_column] = y.values
    df = reduced
    # Report through the GUI output panel like every other operation.
    display_data("data transformed succsesfully")

## Classification

# In[5]:
def split_data():
    """Split the global DataFrame into ``(features, labels)`` using ``target_column``."""
    labels = df[target_column]
    features = df.drop(columns=target_column)
    return features, labels

def logistic_regression():
    """Train and evaluate ``LogisticRegression`` on a 70/30 split of the global data.

    Displays accuracy, confusion matrix and classification report, or a
    notice when no target column has been set.
    """
    if target_column is None:
        display_data("Target column not set.")
        return

    features, labels = split_data()
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )
    model = LogisticRegression(max_iter=1000)
    model.fit(X_fit, y_fit)
    predicted = model.predict(X_eval)

    sections = (
        f"Accuracy: {accuracy_score(y_eval, predicted)}",
        f"Confusion Matrix:\n{confusion_matrix(y_eval, predicted)}",
        f"Classification Report:\n{classification_report(y_eval, predicted)}",
    )
    display_data("\n\n".join(sections))

def Random_forest():
    """Train and evaluate ``RandomForestClassifier`` on an 80/20 split of the global data.

    Displays accuracy, confusion matrix and classification report, or a
    notice when no target column has been set.
    """
    if target_column is None:
        display_data("Target column not set.")
        return

    features, labels = split_data()
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    forest = RandomForestClassifier()
    forest.fit(X_fit, y_fit)
    predicted = forest.predict(X_eval)

    sections = (
        f"Accuracy: {accuracy_score(y_eval, predicted)}",
        f"Confusion Matrix:\n{confusion_matrix(y_eval, predicted)}",
        f"Classification Report:\n{classification_report(y_eval, predicted)}",
    )
    display_data("\n\n".join(sections))




def SVM():
    """Train and evaluate ``SVC`` on an 80/20 split of the global data.

    Displays accuracy, confusion matrix and classification report, or a
    notice when no target column has been set.
    """
    if target_column is None:
        display_data("Target column not set.")
        return

    features, labels = split_data()
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    classifier = SVC()
    classifier.fit(X_fit, y_fit)
    predicted = classifier.predict(X_eval)

    sections = (
        f"Accuracy: {accuracy_score(y_eval, predicted)}",
        f"Confusion Matrix:\n{confusion_matrix(y_eval, predicted)}",
        f"Classification Report:\n{classification_report(y_eval, predicted)}",
    )
    display_data("\n\n".join(sections))



# Regression

# In[6]:
def split_data_regression():
    """Split the global DataFrame into ``(features, target)`` using ``target_column_regression``."""
    target = df[target_column_regression]
    features = df.drop(columns=target_column_regression)
    return features, target



def linear_regression():
    """Fit ``LinearRegression`` on a 70/30 split and display the test mean squared error."""
    if target_column_regression is None:
        display_data("Target column not set.")
        return

    features, target = split_data_regression()
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        features, target, test_size=0.3, random_state=42
    )
    model = LinearRegression()
    model.fit(X_fit, y_fit)
    mse = mean_squared_error(y_eval, model.predict(X_eval))
    display_data(f"Mean Squared Error: {mse:.2f}")



def S_V_R():
    """Fit ``SVR`` on a 70/30 split and display the test mean squared error."""
    if target_column_regression is None:
        display_data("Target column not set.")
        return

    features, target = split_data_regression()
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        features, target, test_size=0.3, random_state=42
    )
    model = SVR()
    model.fit(X_fit, y_fit)
    mse = mean_squared_error(y_eval, model.predict(X_eval))
    display_data(f"Mean Squared Error: {mse:.2f}")



def random_forest_regression():
    """Fit ``RandomForestRegressor`` on a 70/30 split and display the test mean squared error."""
    if target_column_regression is None:
        display_data("Target column not set.")
        return

    features, target = split_data_regression()
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        features, target, test_size=0.3, random_state=42
    )
    model = RandomForestRegressor()
    model.fit(X_fit, y_fit)
    mse = mean_squared_error(y_eval, model.predict(X_eval))
    display_data(f"Mean Squared Error: {mse:.2f}")


## Clustering

# In[7]:
def clustering(n):
    """Run KMeans on the global DataFrame and store the labels in a ``Cluster`` column.

    Args:
        n: Number of clusters, passed as ``KMeans(n_clusters=...)``.
    """
    global df
    # Exclude a 'Cluster' column left by a previous run; otherwise the old
    # labels would be used as a feature and bias the new clustering.
    features = df.drop(columns=['Cluster'], errors='ignore')
    kmeans = KMeans(n_clusters=n, init='k-means++', random_state=42)
    df['Cluster'] = kmeans.fit_predict(features)


## GUI

# In[8]:
def execute_selected_function(selected_option):
    """Run the operation chosen in one of the option list windows.

    Args:
        selected_option: The text of the selected list entry.

    Two kinds of options exist. Viewers return a value that is shown in the
    output panel. Actions modify the data or open their own input dialog and
    report by themselves. An unknown option shows ``"Invalid option."``; any
    exception is shown as ``"Error: ..."``.
    """
    try:
        # Options whose return value is displayed.
        viewers = {
            "head_of_data": head_of_data,
            "tail_of_data": tail_of_data,
            "info_about_data": info_about_data,
            "describe_data": describe_data,
            "shape_of_data": shape_of_data,
            "columns_of_data": columns_of_data,
            "nans_in_data": nans_in_data,
        }
        # Options that produce no value to display here. Every one of them
        # must return early: falling through to display_data(result) with
        # `result` unbound was reported as an error for the dialog options.
        actions = {
            "drop_columns": get_column_name,
            "drop_na": drop_na,
            "simple_imputer": get_strategy,
            "label_encoder": get_column_to_encode,
            "one_encoder": get_column_to_encode_one,
            "scaler": scaler,
            "min_max_scaler": min_max_scaler,
            "PCA": for_pca,
            "Logistic Regression": logistic_regression,
            "Random forest": Random_forest,
            "SVM": SVM,
            "Linear Regression": linear_regression,
            "SVR": S_V_R,
            # Offered by the Regression window; previously had no handler.
            "Random Forest Regressor": random_forest_regression,
            "Random under sampling": ApplyUnderSampling,
            "Random over sampling": ApplyOverSampling,
        }

        action = actions.get(selected_option)
        if action is not None:
            action()
            return

        viewer = viewers.get(selected_option)
        result = viewer() if viewer is not None else "Invalid option."
        display_data(result)
    except Exception as e:
        display_data("Error: " + str(e))

def get_column_name():
    """Open a dialog asking for a column name and drop that column on submit."""
    dialog = Toplevel(root)
    dialog.title("Enter Column Name")

    Label(dialog, text="Enter column name:").pack(pady=10)

    name_entry = Entry(dialog)
    name_entry.pack(pady=5)

    def submit():
        # Hand the typed name to drop_columns(), then close the dialog.
        drop_columns(name_entry.get())
        dialog.destroy()

    Button(dialog, text="Submit", command=submit).pack(pady=5)


def get_strategy():
    """Open a dialog asking for an imputation strategy and apply it on submit.

    The entered text (surrounding whitespace removed) is passed to
    ``simple_imputer``. If imputation fails, the error is shown in the output
    panel and the dialog stays open so the input can be corrected.
    """
    column_window = Toplevel(root)
    column_window.title("Enter Strategy Name")

    label = Label(column_window, text="Enter Strategy name:")
    label.pack(pady=10)

    entry = Entry(column_window)
    entry.pack(pady=5)

    def submit():
        strategy_name = entry.get().strip()
        try:
            simple_imputer(strategy_name)
        except Exception as e:
            # Without this the exception is only printed by Tk's callback
            # handler and the user gets no feedback in the application.
            display_data("Error: " + str(e))
            return
        column_window.destroy()

    button = Button(column_window, text="Submit", command=submit)
    button.pack(pady=5)


def get_column_to_encode():
    """Open a dialog asking for a column name and label-encode that column on submit."""
    dialog = Toplevel(root)
    dialog.title("Enter Column Name")

    Label(dialog, text="Enter Column Name").pack(pady=10)

    name_entry = Entry(dialog)
    name_entry.pack(pady=5)

    def submit():
        # Hand the typed name to label_encoder(), then close the dialog.
        label_encoder(name_entry.get())
        dialog.destroy()

    Button(dialog, text="Submit", command=submit).pack(pady=5)


def get_column_to_encode_one():
    """Open a dialog asking for a column name and one-hot encode that column on submit."""
    dialog = Toplevel(root)
    dialog.title("Enter Column Name")

    Label(dialog, text="Enter Column Name").pack(pady=10)

    name_entry = Entry(dialog)
    name_entry.pack(pady=5)

    def submit():
        # Hand the typed name to one_encoder(), then close the dialog.
        one_encoder(name_entry.get())
        dialog.destroy()

    Button(dialog, text="Submit", command=submit).pack(pady=5)


def get_target_column():
    """Open a dialog asking for the classification target column.

    On a valid name the global ``target_column`` is set, the dialog closes
    and the classifier list is opened. An unknown name is reported and the
    dialog stays open; the previously set target is left untouched.
    """
    column_window = Toplevel(root)
    column_window.title("Enter Target Column Name")

    label = Label(column_window, text="Enter target column name:")
    label.pack(pady=10)

    entry = Entry(column_window)
    entry.pack(pady=5)

    def submit():
        global target_column
        candidate = entry.get()
        if df is None:
            # Nothing loaded yet: df.columns would raise inside the callback.
            display_data("No data loaded.")
            return
        if candidate not in df.columns:
            display_data(f"Target column '{candidate}' not found.")
            return
        # Assign only after validation so a typo cannot replace a valid target.
        target_column = candidate
        display_data(f"Target column '{target_column}' set successfully.")
        column_window.destroy()
        Classification()

    submit_button = Button(column_window, text="Submit", command=submit)
    submit_button.pack(pady=10)





def get_target_column_regression():
    """Open a dialog asking for the regression target column.

    On a valid name the global ``target_column_regression`` is set, the dialog
    closes and the regressor list is opened. An unknown name is reported and
    the dialog stays open; the previously set target is left untouched.
    """
    column_window = Toplevel(root)
    column_window.title("Enter Target Column Name for Regression")

    label = Label(column_window, text="Enter target column name:")
    label.pack(pady=10)

    entry = Entry(column_window)
    entry.pack(pady=5)

    def submit():
        global target_column_regression
        candidate = entry.get()
        if df is None:
            # Nothing loaded yet: df.columns would raise inside the callback.
            display_data("No data loaded.")
            return
        if candidate not in df.columns:
            display_data(f"Target column '{candidate}' not found.")
            return
        # Assign only after validation so a typo cannot replace a valid target.
        target_column_regression = candidate
        display_data(f"Target column '{target_column_regression}' set successfully for Regression.")
        column_window.destroy()
        Regression()

    submit_button = Button(column_window, text="Submit", command=submit)
    submit_button.pack(pady=10)


def get_percent():
    """Open a dialog asking for the feature-selection percentile and apply it.

    A positive integer is stored in the global ``percent`` and passed to
    ``ApplySelectPercentile``. Anything else is reported in the output panel
    and the dialog stays open.
    """
    column_window = Toplevel(root)
    column_window.title("Enter the percent")

    label = Label(column_window, text="Enter the percent")
    label.pack(pady=10)

    entry = Entry(column_window)
    entry.pack(pady=5)

    def submit():
        global percent
        try:
            # Conversion happens inside the try so non-numeric input is
            # reported instead of raising inside the Tk callback.
            value = int(entry.get())
            if value > 0:
                percent = value
                display_data(f"percent is '{percent}' set successfully.")
                column_window.destroy()
                ApplySelectPercentile(percent)
            else:
                display_data("Percent must be a positive integer.")
        except Exception as e:
            display_data("Error: " + str(e))

    submit_button = Button(column_window, text="Submit", command=submit)
    submit_button.pack(pady=10)




def get_number_of_clusters():
    """Open a dialog asking for the number of clusters and run the clustering.

    A positive integer is stored in the global ``num_clusters`` and passed to
    ``clustering``. Anything else is reported in the output panel and the
    dialog stays open.
    """
    column_window = Toplevel(root)
    column_window.title("Enter Number of Clusters")

    label = Label(column_window, text="Enter number of clusters:")
    label.pack(pady=10)

    entry = Entry(column_window)
    entry.pack(pady=5)

    def submit():
        global num_clusters
        try:
            # Conversion happens inside the try so non-numeric input is
            # reported, and the global is never left holding the raw text.
            value = int(entry.get())
            if value > 0:
                num_clusters = value
                display_data(f"Number of clusters '{num_clusters}' set successfully.")
                column_window.destroy()
                clustering(num_clusters)
            else:
                display_data("Number of clusters must be a positive integer.")
        except Exception as e:
            display_data("Error: " + str(e))

    submit_button = Button(column_window, text="Submit", command=submit)
    submit_button.pack(pady=10)




def for_pca():
    """Open a dialog asking for the number of PCA components and apply PCA.

    The entered text is passed to ``apply_pca``. If that fails (for example
    on non-numeric input), the error is shown in the output panel and the
    dialog stays open so the input can be corrected.
    """
    column_window = Toplevel(root)
    column_window.title("Enter The Number of Components")

    label = Label(column_window, text="Enter the number of components:")
    label.pack(pady=10)

    entry = Entry(column_window)
    entry.pack(pady=5)

    def submit():
        n = entry.get()
        try:
            apply_pca(n)
        except Exception as e:
            # Without this the exception is only printed by Tk's callback
            # handler and the user gets no feedback in the application.
            display_data("Error: " + str(e))
            return
        column_window.destroy()

    submit_button = Button(column_window, text="Submit", command=submit)
    submit_button.pack(pady=10)

def Regression():
    """Open a list window offering the regression algorithms.

    Selecting an entry starts ``execute_selected_function`` for it on a new
    thread and closes the window.
    """
    chooser = Toplevel(root)
    chooser.title("Options")

    choices = Listbox(chooser)
    choices.pack(side="left", fill="y")
    choices.insert("end", "Linear Regression", "SVR", "Random Forest Regressor")

    bar = Scrollbar(chooser, orient="vertical")
    bar.pack(side="right", fill="y")

    # Wire list and scrollbar to each other.
    choices.config(yscrollcommand=bar.set)
    bar.config(command=choices.yview)

    def on_select(event):
        picked = choices.curselection()
        if not picked:
            return
        option_text = choices.get(picked)
        worker = threading.Thread(target=execute_selected_function, args=(option_text,))
        worker.start()
        chooser.destroy()

    choices.bind("<<ListboxSelect>>", on_select)
def Classification():
    """Open a list window offering the classification algorithms.

    Selecting an entry starts ``execute_selected_function`` for it on a new
    thread and closes the window.
    """
    chooser = Toplevel(root)
    chooser.title("Options")

    choices = Listbox(chooser)
    choices.pack(side="left", fill="y")
    choices.insert("end", "Logistic Regression", "Random forest", "SVM")

    bar = Scrollbar(chooser, orient="vertical")
    bar.pack(side="right", fill="y")

    # Wire list and scrollbar to each other.
    choices.config(yscrollcommand=bar.set)
    bar.config(command=choices.yview)

    def on_select(event):
        picked = choices.curselection()
        if not picked:
            return
        option_text = choices.get(picked)
        worker = threading.Thread(target=execute_selected_function, args=(option_text,))
        worker.start()
        chooser.destroy()

    choices.bind("<<ListboxSelect>>", on_select)










def Preprocessing():
    """Open a list window offering the inspection and preprocessing operations.

    Selecting an entry starts ``execute_selected_function`` for it on a new
    thread and closes the window.
    """
    operations = (
        "head_of_data",
        "tail_of_data",
        "info_about_data",
        "describe_data",
        "shape_of_data",
        "columns_of_data",
        "nans_in_data",
        "drop_columns",
        "drop_na",
        "simple_imputer",
        "label_encoder",
        "one_encoder",
        "scaler",
        "min_max_scaler",
        "PCA",
        "Random over sampling",
        "Random under sampling",
    )

    chooser = Toplevel(root)
    chooser.title("Options")

    choices = Listbox(chooser)
    choices.pack(side="left", fill="y")
    choices.insert("end", *operations)

    bar = Scrollbar(chooser, orient="vertical")
    bar.pack(side="right", fill="y")

    # Wire list and scrollbar to each other.
    choices.config(yscrollcommand=bar.set)
    bar.config(command=choices.yview)

    def on_select(event):
        picked = choices.curselection()
        if not picked:
            return
        option_text = choices.get(picked)
        worker = threading.Thread(target=execute_selected_function, args=(option_text,))
        worker.start()
        chooser.destroy()

    choices.bind("<<ListboxSelect>>", on_select)

def display_data(data):
    """Replace the contents of the output panel with the text form of ``data``.

    Args:
        data: Any object; it is converted with ``str`` before insertion.
    """
    output_text.delete("1.0", "end")
    # Convert explicitly: tkinter turns tuples and lists into Tcl lists, so a
    # shape like (150, 5) or a list of column names would otherwise not be
    # shown in its Python form.
    output_text.insert("end", str(data))

# Build the main window. `root` and `output_text` are module-level names that
# the dialog helpers and display_data() above refer to.
root = Tk()
root.title("Machine Learning Project")

# Top rows: file loading and the preprocessing menu, packed one below the other.
browse_button = Button(root, text="Browse", command=browse_file, font=("Arial", 10, "bold"), foreground="#00FF00",
                       background="#000000", activebackground="#00FF00")
browse_button.pack(pady=10)

preprocess_button = Button(root, text="Preprocessing", command=Preprocessing, font=("Arial", 10, "bold"), foreground="#05313d",
                           background="#00ffff", activebackground="#05313d")
preprocess_button.pack(padx=20, pady=10)

# Modelling buttons: each opens an input dialog first (target column,
# cluster count or percentile). They are created here and packed further down.
target_button = Button(root, text="Classification", command=get_target_column, font=("Arial", 10, "bold"), foreground="#FF4500",
                       background="#000000", activebackground="#FF4500")
Regression_button = Button(root,
                          text="Regression",
                          command=get_target_column_regression,
                          font=("Arial", 10, "bold"),
                          foreground="cyan",
                          background="#000000",
                          activebackground="cyan")


Clustering_button = Button(root,
                          text="Clustering",
                          command=get_number_of_clusters,
                          font=("Arial", 10, "bold"),
                          foreground="blue",
                          background="#000000",
                          activebackground="blue")



feature_button = Button(root,
                          text="feature selection",
                          command=get_percent,
                          font=("Arial", 10, "bold"),
                          foreground="blue",
                          background="#000000",
                          activebackground="blue")

# Pack order fixes the left-to-right order of the modelling buttons.
Regression_button.pack(side=LEFT, padx=10)
target_button.pack(side=LEFT, padx=30, pady=10)
Clustering_button.pack(side=LEFT, padx=10)
feature_button.pack(side=LEFT, padx=40, pady=10)
# Output panel written to by display_data().
output_text = Text(root, height=40, width=80)
output_text.pack(pady=10)

# Enter the Tk event loop.
root.mainloop()
