In [88]:
import pandas as pd 
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [89]:
def fitModel(inputFile, outputFile):
    """Train a random-forest pipeline on a CSV file and pickle the fitted pipeline.

    The CSV is split 80/20 (fixed seed), the pipeline is fitted on the
    training part, accuracy on the held-out part is printed, and the fitted
    pipeline (preprocessing + classifier) is written to ``outputFile``.

    Parameters
    ----------
    inputFile : str or path-like
        CSV file to read. It must contain a ``target`` column; every other
        column is treated as a feature.
    outputFile : str or path-like
        Destination of the pickled, fitted ``Pipeline``.

    Returns
    -------
    None
        The accuracy is printed, not returned.

    Raises
    ------
    KeyError
        If the CSV has no ``target`` column.
    """
    df = pd.read_csv(inputFile)

    # Separate features and target
    features = df.drop('target', axis=1)
    target = df['target']

    # Assign columns to a transformer by dtype. ColumnTransformer drops any
    # column that is not listed (remainder='drop' is its default), so the
    # selection has to be broad: 'number' covers every numeric width (not only
    # int64/float64), and booleans are one-hot encoded instead of being lost.
    categorical_cols = features.select_dtypes(
        include=['object', 'category', 'bool']).columns
    numerical_cols = features.select_dtypes(include='number').columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            # handle_unknown='ignore': a category that only shows up in the
            # held-out split (or in data scored later with the pickled
            # pipeline) is encoded as all zeros instead of raising.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ])

    # Preprocessing and classifier live in one pipeline so that the pickle
    # can be applied directly to raw feature frames.
    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', RandomForestClassifier(random_state=42))
    ])

    # Fixed seed keeps the split, and therefore the reported accuracy, reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42)

    model_pipeline.fit(X_train, y_train)

    # Evaluate on the held-out 20 %
    y_pred = model_pipeline.predict(X_test)
    print(f'The accuracy for {inputFile} is {accuracy_score(y_test, y_pred)}')

    with open(outputFile, 'wb') as file:
        pickle.dump(model_pipeline, file)

In [90]:
# Fit one pipeline per cleaned dataset; each call writes its own pickle file.
for csv_path, pickle_path in (
    ('data_clean.csv', 'model1.pkl'),
    ('cleveland_data_clean.csv', 'model2.pkl'),
    ('heart_clean.csv', 'model3.pkl'),
):
    fitModel(csv_path, pickle_path)

The accuracy for data_clean.csv is 0.8305084745762712
The accuracy for cleveland_data_clean.csv is 0.7
The accuracy for heart_clean.csv is 0.8315217391304348


In [79]:
# Reload the pipeline that fitModel('data_clean.csv', 'model1.pkl') saved.
# (The file name used to be 'model.pkl', which nothing in this notebook writes.)
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open('model1.pkl', 'rb') as file:
    loaded_pipeline = pickle.load(file)

df = pd.read_csv('data_clean.csv')
X = df.drop('target', axis=1)
y = df['target']
y_pred = loaded_pipeline.predict(X)
# This scores the whole file, including the rows the pipeline was trained on,
# so it is an in-sample sanity check of the reload and not a held-out estimate.
print(f'The accuracy for data_clean.csv is {accuracy_score(y, y_pred)}')

The accuracy for data_clean.csv is 0.9625850340136054
