In [16]:
import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RepeatedKFold, KFold
import matplotlib.pyplot as plt
from tqdm import tqdm

def file_exists(filepath):
    if os.path.exists(filepath):
        return True
    else:
        return False
    
def rename_filename(filepath, number=1):
    name = os.path.splitext(filepath)[0]
    extension = os.path.splitext(filepath)[1]

    name += "_" + str(number)
    filepath = name + extension
    if file_exists(filepath):
        return rename_filename(filepath=filepath, number=number+1)
    else:
        return filepath

def save_model(to_save, filepath):
    import pickle

    try:
        if file_exists(filepath=filepath):
            filepath = rename_filename(filepath=filepath)
        pickle.dump(to_save, open(filepath, 'wb'))
        print("Saved successfully")
        return True, filepath
    except Exception as e:
        print("Error during saving model:\n", e)
        return False, filepath


def choose_model(option_user, **params):
    if int(option_user) == 1:
        model_option = input("Which model do you want to use?                                                                              1 = LinearRegression 2 = PolynomialFeatures 3 = SVM - SVR                                                    4 =  RandomForestRegressor")
        option_user = int(model_option)
        if option_user == 1:
            from sklearn.linear_model import LinearRegression
            if params:
                for k,v in params.items():
                    model = LinearRegression(**v) 
                return model
            else:
                model = LinearRegression()
                return model

        if option_user == 2:
            from sklearn.preprocessing import PolynomialFeatures
            from sklearn.linear_model import LinearRegression
            if params:
                for k,v in params.items():
                    model = PolynomialFeatures(**v) 
                return model
            else:
                raise ValueError("Missing argument degree")
            
        if option_user == 3:
            from sklearn.svm import SVR
            if params:
                for k,v in params.items():
                    model = SVR(**v) 
                return model
            else:
                model = SVR()
                return model

        if option_user == 4:
            from sklearn.ensemble import RandomForestRegressor
            if params:
                for k,v in params.items():
                    model = RandomForestRegressor(**v) 
                return model
            else:
                model = RandomForestRegressor()
                return model

        return model 
        
    elif int(option_user) == 2:
        model_option = input("Which model do you want to use? 1 = LogisticRegression, 2 = svm - SVC, 3 =                                  KNeighborsClassifier 4 = RandomForestClassifier(), 5 = XGBClassifier()")
        
        option_user = int(model_option)

        if option_user == 1:
            from sklearn.linear_model import LogisticRegression
            if params:
                for k,v in params.items():
                    model = LogisticRegression(**v) 
                return model
            else:
                model = LogisticRegression()
                return model


        if option_user == 2:
            from sklearn import svm
            if params:
                for k,v in params.items():
                    model = svm.SVC(**v) 
                return model
            else:
                model = svm.SVC()
                return model

        if option_user == 3:
            from sklearn.neighbors import KNeighborsClassifier
            for k,v in params.items():
                model = KNeighborsClassifier(**v) 
                return model
            else:
                raise ValueError("Missing argument n_neighbors")

        if option_user == 4:
            from sklearn.ensemble import RandomForestClassifier
            if params:
                for k,v in params.items():
                    model = RandomForestClassifier(**v) 
                return model
            else:
                model = RandomForestClassifier()
                return model

        if option_user == 5:
            from xgboost import XGBClassifier
            for k,v in params.items():
                model = XGBClassifier(**v) 
                return model
            else:
                model = XGBClassifier()
                return model

#return model

def train_model(model, df, target_name):
    X = df.drop(target_name, 1).values
    y = df[target_name].values
    
    
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=4)
    print(X_train)
    
    kfold_train = input("Do you want cross validation? yes or no")
    if kfold_train.lower() != "yes":
        """
    Starting training process with all X_train data

        """
    
        if str(model).startswith("PolynomialFeatures"):
            
            X_poly = model.fit_transform(X_train, y_train)

            many_linear = (input('Enter the number of parameters you want to give for LinearRegression, if you dont want any, enter: no. '))
            param_list_linear = {}
            if many_linear.lower == "no":
                lin_reg_model = LinearRegression()
            else:
                for i in range(int(many_linear)):
                    data = input('Enter parameter & value separated by ":" ') 
                    temp = data.split(':') 
                    if temp[1].isdigit():
                        param_list_linear[temp[0]] = int(temp[1]) 
                    elif ("True" in temp[1])|("False" in temp[1]):
                        param_list_linear[temp[0]] = bool(temp[1])
                    else:
                        param_list_linear[temp[0]] = temp[1]
                if param_list_linear:
                    lin_reg_model = LinearRegression(**param_list_linear) 
                    

            model_trained = lin_reg_model.fit(X_poly, y_train)

            X_test_poly = model.fit_transform(X_test, y_test)
            accuracy = model_trained.score(X_test_poly, y_test)

        else:
            model_trained = model.fit(X_train, y_train)
            accuracy = model_trained.score(X_test, y_test)
    else:
        small_portions = input("Do you want to use cross validation in small steps? (for large datasets), put yes or no.")
        if small_portions == "no":
            """
            Starting the training with cross validation normally
            """
            if str(model).startswith("PolynomialFeatures"):
                    n_splits = int(input("Put number of n_splits:"))
                    n_repeats = int(input("Put the number of n_repeats:"))
                    k_fold = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=4)
                    val_score = []
                    train_score = []
                    
                    X_poly = model.fit_transform(X_train, y_train)

                    many_linear_1 = input('Enter the number of parameters you want to give for LinearRegression, if you dont want any, enter: no. ')
                    
                    param_list_linear = {}
                    if many_linear_1 == "no":
                        
                        lin_reg_model = LinearRegression()
                    else:
                        for i in range(int(many_linear_1)):
                            data = input('Enter parameter & value separated by ":" ') 
                            temp = data.split(':') 
                            if temp[1].isdigit():
                                param_list_linear[temp[0]] = int(temp[1]) 
                            elif ("True" in temp[1])|("False" in temp[1]):
                                param_list_linear[temp[0]] = bool(temp[1])
                            else:
                                param_list_linear[temp[0]] = temp[1]
                        if param_list_linear:
                            lin_reg_model = LinearRegression(**param_list_linear) 
                            
                    for i, (train, val) in enumerate(k_fold.split(X_poly)):
                        model_trained = lin_reg_model.fit(X_poly[train], y_train[train])
                        score_val = lin_reg_model.score(X_poly[val], y_train[val])
                        val_score.append(score_val)
                        score_train = lin_reg_model.score(X_poly[train], y_train[train])
                        train_score.append(score_train)
                    #model_trained = lin_reg_model.fit(X_poly, y_train)

                    X_test_poly = model.fit_transform(X_test, y_test)
                    accuracy = model_trained.score(X_test_poly, y_test)
            else:
                n_splits = int(input("Put number of n_splits:"))
                n_repeats = int(input("Put the number of n_repeats:"))
                k_fold = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=4)
                val_score = []
                train_score = []
                for i, (train, val) in enumerate(k_fold.split(X_train)):
                    model_trained = model.fit(X_train[train], y_train[train])
                    score_val = model.score(X_train[val], y_train[val])
                    val_score.append(score_val)
                    score_train = model.score(X_train[train], y_train[train])
                    train_score.append(score_train)

                accuracy = model_trained.score(X_test, y_test)

            
            print("showing the learning process")
            plt.plot(train_score, label="train")
            plt.plot(val_score, label="val", color="orange")
            plt.ylabel("score")
            plt.legend()
            plt.show()
        else:
            """
            Starting cross validation with small steps - for large datasets. 
            """
            if str(model).startswith("PolynomialFeatures"):
                val_score = []
                train_score = []
                scores_small_trains = []
                scores_small_vals = []
                scores_smalls_trains_iterations = []
                scores_smalls_vals_iterations = []

                
                n_splits = int(input("Put number of n_splits:"))
                n_repeats = int(input("Put the number of n_repeats:"))
                k_fold = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=4)
                n_small_splits = int(input("put number of n_splits for the small sections to train in cross validation"))
                kfold_small_trains = KFold(n_splits=n_small_splits, random_state=4)
                """
                Per model the warm_state parameter is different (if they even excist). Look for documentation how to apply                        warm state for cross validation learning. below it will ask if you want to apply partial_fit
                """
                partial_fit = input("apply partial_fit to model?, put yes or no")
    
                
                X_poly = model.fit_transform(X_train, y_train)

                many_linear_1 = input('Enter the number of parameters you want to give for LinearRegression, if you dont want any, enter: no. ')
                param_list_linear = {}
                if many_linear_1 == "no":
                    lin_reg_model = LinearRegression()
                else:
                    for i in range(int(many_linear_1)):
                        data = input('Enter parameter & value separated by ":" ') 
                        temp = data.split(':') 
                        if temp[1].isdigit():
                            param_list_linear[temp[0]] = int(temp[1]) 
                        elif ("True" in temp[1])|("False" in temp[1]):
                            param_list_linear[temp[0]] = bool(temp[1])
                        else:
                            param_list_linear[temp[0]] = temp[1]
                    if param_list_linear:
                        lin_reg_model = LinearRegression(**param_list_linear) 

                for i, (train, val) in enumerate(k_fold.split(X_poly)):
                    to_show_in_bar = ": " + str(i) + "/" + str(n_splits * n_repeats)
                    generator_val = kfold_small_trains.split(val)
                    

                    for i2,(_, small_train) in tqdm(enumerate(kfold_small_trains.split(train)), total=n_small_splits, desc="Small train  progress" + to_show_in_bar):
                        _, small_val = next(generator_val)
                            

                        if partial_fit.lower() == "yes":
                            model_trained = lin_reg_model.partial_fit(X_poly[small_train], y_train[small_train], classes=np.unique(y))
                        else:
                            model_trained = lin_reg_model.fit(X_poly[small_train], y_train[small_train])

                        score_small_train = model.score(X_poly[small_train], y_train[small_train])
                        scores_smalls_trains_iterations.append(score_small_train)
                        # val part
                        score_small_val = model.score(X_poly[small_val], y_train[small_val])
                        scores_smalls_vals_iterations.append(score_small_val)
                    
                        train_score.append(np.mean(scores_smalls_trains_iterations))
                        scores_small_trains = scores_small_trains + list(scores_smalls_trains_iterations)
                        scores_smalls_trains_iterations.clear()
                        val_score.append(np.mean(scores_smalls_vals_iterations))
                        scores_small_vals = scores_small_vals + list(scores_smalls_vals_iterations)
                        scores_smalls_vals_iterations.clear()

                    print("Iteration:", to_show_in_bar, "| Val_accuracy:", np.mean(val_score),                                          "| train_accuracy: ", np.mean(train_score), sep="~~~~~~")
                print("Trained finished!")

            else:
                val_score = []
                train_score = []
                scores_small_trains = []
                scores_small_vals = []
                scores_smalls_trains_iterations = []
                scores_smalls_vals_iterations = []

                
                n_splits = int(input("Put number of n_splits:"))
                n_repeats = int(input("Put the number of n_repeats:"))
                k_fold = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=4)
                n_small_splits = int(input("put number of n_splits for the small sections to train in cross                                      validation"))
                
                kfold_small_trains = KFold(n_splits=n_small_splits, random_state=4)
                """
                Per model the warm_state parameter is different (if they even excist). Look for documentation how to apply                        warm state for cross validation learning. below it will ask if you want to apply partial_fit"""
                partial_fit = input("apply partial_fit to model?, put yes or no")

                for i, (train, val) in enumerate(k_fold.split(X_train)):
                    to_show_in_bar = ": " + str(i) + "/" + str(n_splits * n_repeats)
                    generator_val = kfold_small_trains.split(val)

                    for i2,(_, small_train) in tqdm(enumerate(kfold_small_trains.split(train)), total=n_small_splits, desc="Small train  progress" + to_show_in_bar):
                        _, small_val = next(generator_val)
                                

                        if partial_fit.lower() == "yes":
                            model_trained = model.partial_fit(X_train[small_train], y_train[small_train], classes=np.unique(y))
                        else:
                            model_trained = model.fit(X_train[small_train], y_train[small_train])

                        score_small_train = model.score(X_train[small_train], y_train[small_train])
                        scores_smalls_trains_iterations.append(score_small_train)
                        # val part
                        score_small_val = model.score(X_train[small_val], y_train[small_val])
                        scores_smalls_vals_iterations.append(score_small_val)
                        
                        train_score.append(np.mean(scores_smalls_trains_iterations))
                        scores_small_trains = scores_small_trains + list(scores_smalls_trains_iterations)
                        scores_smalls_trains_iterations.clear()
                        val_score.append(np.mean(scores_smalls_vals_iterations))
                        scores_small_vals = scores_small_vals + list(scores_smalls_vals_iterations)
                        scores_smalls_vals_iterations.clear()

                        print("Iteration:", to_show_in_bar, "| Val_accuracy:", np.mean(val_score), "| train_accuracy: ",                                        np.mean(train_score), sep="~~~~~~")
                print("Trained finished!")
                    
            accuracy = model_trained.score(X_test, y_test)
            
            print("showing the learning process")
            plt.plot(train_score, label="train")
            plt.plot(val_score, label="val", color="orange")
            plt.ylabel("score")
            plt.legend()
            plt.show()


        
    return model_trained, accuracy

'''
for regression: 
    option 1 = LinearRegression
    option 2 = PolynomialFeatures
    option 3 = SVM - SVR
    option 4 = RandomForestRegressor
for classification: 
    option 1 = LogisticRegression
    option 2 = KNeighborsClassifier
    option 3 = svm - SVC
    option 4 = RandomForestClassifier()
    option 5 = XGBClassifier()
'''
def main(df):
    choice = input("What type of problem: 1 for regression or 2 for classification?")
    params = input("Enter YES in case you want to enter a dictionary of params, if not neccesary put NO") 
    target = input("What is the target column?")

    if params.lower() == "no":
        model = choose_model(option_user=choice)
        model_trained, accuracy = train_model(model=model, df=df, target_name=target)
    else:
        many = int(input('Enter the number of parameters you want to give: '))
        param_list = {}
        for i in range(many):
            data = input('Enter parameter & value separated by ":" ') 
            temp = data.split(':') 
            if temp[1].isdigit():
                param_list[temp[0]] = int(temp[1]) 
            elif ("True" in temp[1])|("False" in temp[1]):
                param_list[temp[0]] = bool(temp[1])
            else:
                param_list[temp[0]] = temp[1]

        model = choose_model(option_user=choice, params=param_list)
        model_trained, accuracy = train_model(model=model, df=df, target_name=target)

    import time
    print("score of model:", accuracy)
    

    time.sleep(3.5)    # pause 3.5 seconds

    save = input("Do you want to save the model?, put yes or no")

    if save.lower() == "yes":
        filepath = input("Put the filepath where you want to save the model as following: name_file.sav ")
        save_model(to_save=model_trained, filepath=filepath)
    
    
    return model_trained
    


    



  

In [18]:
import pandas as pd
df = pd.read_csv("C:\\Users\\Roxan\\OneDrive\\Documentos\\My_map_2\\Data-science-bootcamp\\Curso\\week_10\\day1\\exercise\\train_set.csv")

In [19]:
df.drop("0",1,inplace = True)

In [20]:

#df_data = df.drop("17", 1)
from sklearn.preprocessing import LabelEncoder
#.drop("17", 1)

objList = df.select_dtypes(include = "object").columns
print (objList)
le = LabelEncoder()

for feat in objList:
    df[feat] = le.fit_transform(df[feat].astype(str))

Index(['2', '4', '6', '7', '8', '12', '13', '15', '17'], dtype='object')


In [23]:
main(df=df)

[[2.600e+01 1.000e+00 2.000e+00 ... 8.000e+00 7.000e+00 3.047e+03]
 [2.900e+01 0.000e+00 4.000e+00 ... 2.000e+00 2.000e+00 4.426e+03]
 [1.800e+01 3.000e+00 1.300e+01 ... 3.000e+00 3.000e+00 3.766e+03]
 ...
 [3.000e+00 2.000e+00 3.000e+00 ... 4.000e+00 3.000e+00 5.319e+03]
 [1.500e+01 2.000e+00 5.000e+00 ... 4.000e+00 2.000e+00 6.266e+03]
 [1.500e+01 2.000e+00 5.000e+00 ... 1.200e+01 7.000e+00 2.988e+03]]
Small train  progress: 0/20:   0%|          | 0/10 [00:00<?, ?it/s]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [6]:
import numpy as np
from sklearn import datasets 
cancer = datasets.load_breast_cancer() 

df_cancer = pd.DataFrame(data= np.c_[cancer['data'], cancer['target']],
                     columns= list(cancer['feature_names']) + ['target'] )

df_cancer

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0.0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0.0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0.0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0.0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0.0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0.0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0.0
