In [31]:
from deepchecks.tabular import Dataset
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from deepchecks.tabular.suites import data_integrity
from joblib import dump
import pandas as pd
import os

In [41]:
# Load sklearn dataframe
def load_Empdata_df(filename):
    dataset_dir = ''
    # Define the path to the "empfile.csv" file within the "dataset" directory
    file_path = os.path.join(dataset_dir, filename)
    df = pd.read_csv(file_path)
    return df

In [42]:
# Split Dataframe
def split_dataframe():
    df = load_Empdata_df('Train-employee-salary.csv')
    X = df.iloc[1:,:-1]
    y = df.iloc[1:,-1]
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [34]:
X_train, X_test, y_train, y_test = split_dataframe()

In [35]:
X_train.head()

Unnamed: 0,id,groups,age,healthy_eating,active_lifestyle
779,779,O,59,4.0,7.0
287,287,B,25,4.0,1.0
166,166,AB,59,6.0,8.0
961,961,A,38,3.0,3.0
494,494,AB,61,7.0,4.0


In [43]:
#Converting Dataframe to Dataset for Deepcheck uses
def load_dataset():
    df = load_Empdata_df('Train-employee-salary.csv')
    # Create Dataset objects for the diabetes dataframe
    ds = Dataset(df, label='salary', cat_features=['groups','healthy_eating','active_lifestyle'])
    return ds

In [44]:
# Training Linear Model
def train_linear_model(filename='trained_SalaryPrediction_linear_model'):
    try:
        X_train, X_test, y_train, y_test = split_dataframe()
        # Check data types and valid test_frac value
        assert isinstance(X_train, pd.DataFrame), "X_train must be a DataFrame"
        assert isinstance(X_test, pd.DataFrame), "y_train must be a DataFrame"
        assert isinstance(filename, str), "Filename must be a string"

        # Instantiate a Linear Regression model
        model = LinearRegression()

        # Fit the model with training data
        model.fit(X_train, y_train)  # Assuming 'target' is the column to predict

        # Save the trained model to a file
        fname = filename + '.joblib'
        dump(model, fname)

        # Compute R-squared scores for training and test data
        r2_train = model.score(X_train, y_train)
        r2_test = model.score(X_test, y_test)  # Assuming 'target' is the column to predict

        print("Train R-squared:", r2_train)
        print("Test R-squared:", r2_test)

        # Return scores in a dictionary
        return {'Train-score': r2_train, 'Test-score': r2_test}

    except AssertionError as msg:
        print(msg)
        return msg

In [45]:
train_linear_model()

ValueError: could not convert string to float: 'O'

In [None]:
# Deepcheck Data integrity check and saving the report into HTML
def data_integrity_check():
    ds = load_dataset()

    # Run Suite:
    integ_suite = data_integrity()
    suite_result = integ_suite.run(ds)

    # Save the result report as an HTML file
    suite_result.save_as_html("data_integrity_report.html")
