# Step 1: Import Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Step 2: Load the Dataset

In [12]:
# Read the training and test CSV files into DataFrames.
# The training filename contains a space before ".csv"; it is kept exactly as written.
train_path = '/workspaces/150338625/Train_Dataset .csv'
test_path = '/workspaces/150338625/Test_Dataset.csv'

train_dataset = pd.read_csv(train_path)
test_dataset = pd.read_csv(test_path)

# Step 3: Summary Statistics of the Dataset

In [5]:
# Print three plain-text reports, each for the training frame first and the
# test frame second: descriptive statistics, column dtypes, and per-column
# counts of missing values.
reports = (
    ("Summary Statistics:", lambda frame: frame.describe()),
    ("\nData Types:", lambda frame: frame.dtypes),
    ("\nMissing Values:", lambda frame: frame.isnull().sum()),
)

for heading, build_report in reports:
    print(heading)
    for frame in (train_dataset, test_dataset):
        print(build_report(frame))

Summary Statistics:
                 Id          Age          Sex           cp     trestbps  \
count   7303.000000  7303.000000  7303.000000  7303.000000  7303.000000   
mean   15021.535396    53.172669     0.499658     1.502533   147.447487   
std     2886.026080    14.185970     0.500034     1.115594    31.099538   
min    10001.000000    29.000000     0.000000     0.000000    94.000000   
25%    12521.500000    41.000000     0.000000     1.000000   120.000000   
50%    15054.000000    53.000000     0.000000     1.000000   148.000000   
75%    17513.500000    65.000000     1.000000     3.000000   174.000000   
max    19998.000000    77.000000     1.000000     3.000000   200.000000   

              chol          fbs      restecg      thalach        exang  \
count  7303.000000  7303.000000  7303.000000  7303.000000  7303.000000   
mean    342.805970     0.493085     1.013008   136.506093     0.503218   
std     127.291998     0.499986     0.815806    38.141966     0.500024   
min     

# Step 4: Identify Categorical Variables

In [9]:
# Columns that are one-hot encoded in the following steps.
# NOTE(review): the summary output printed above lists a column named 'Sex'
# (capitalised); confirm the loaded files really use lowercase 'sex',
# otherwise pd.get_dummies will not find this column.
categorical_columns = [
    'sex',
    'cp',
    'restecg',
    'slope',
    'ca',
    'thal',
]


# Step 5: Encode categorical variables in the training dataset

In [13]:
# One-hot encode the categorical columns of the training frame.
# drop_first=True omits the first level of each column, so that level is the
# implicit baseline (all of the column's dummies equal to 0).
train_dataset_encoded = pd.get_dummies(
    data=train_dataset,
    columns=categorical_columns,
    drop_first=True,
)

# Step 6: Encode categorical variables in the test dataset

In [14]:
# One-hot encode the categorical columns of the test frame using the category
# levels observed in the TRAINING data.
#
# Why: encoding the test frame on its own with drop_first=True drops whichever
# level happens to be first *in the test data*. If the test set is missing the
# training baseline level, a different level is dropped, and a later
# zero-filled column alignment would silently turn those rows into the
# training baseline. Pinning each column to the training levels guarantees the
# same baseline is dropped and the same dummy columns are produced.
#
# pd.Categorical(train_dataset[column]).categories mirrors the level ordering
# get_dummies derives for the training frame. Test values that were never seen
# in training become NaN and therefore get all-zero dummies.
test_dataset_typed = test_dataset.assign(**{
    column: pd.Categorical(
        test_dataset[column],
        categories=pd.Categorical(train_dataset[column]).categories,
    )
    for column in categorical_columns
})
test_dataset_encoded = pd.get_dummies(
    test_dataset_typed,
    columns=categorical_columns,
    drop_first=True,
)

# Step 7: Align the columns of the test set to match the training set

In [15]:
# Give the encoded test frame exactly the training feature columns (every
# training column except 'target'), in the same order. Columns missing from
# the test frame are created and filled with 0; columns absent from the
# training frame are discarded.
expected_feature_columns = train_dataset_encoded.columns.drop('target')
test_dataset_encoded = test_dataset_encoded.reindex(
    columns=expected_feature_columns,
    fill_value=0,
)


# Step 8: Prepare features and target for the training dataset

In [16]:
# Split the encoded training frame into the label vector and the feature matrix.
# NOTE(review): only 'target' is removed, so any row-identifier column in the
# training file remains a model feature; confirm that is intended.
y_train = train_dataset_encoded.loc[:, 'target']
X_train = train_dataset_encoded.drop('target', axis=1)

# Step 9: Fit your model

In [17]:
# Train a random forest on the encoded training features.
# A fixed random_state makes the bootstrap sampling and feature sub-sampling
# deterministic, so re-running the notebook reproduces the same model and the
# same submission file.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 10: Prepare features for the test dataset

In [18]:
# The aligned, encoded test frame is used directly as the feature matrix.
# This is an alias, not a copy: X_test and test_dataset_encoded are the same object.
X_test = test_dataset_encoded  # 'target' was already excluded when the columns were aligned to the training features


# Step 11: Make predictions

In [19]:
# Predict a class label for every row of the test feature matrix with the fitted model.
predictions = model.predict(X_test)

# Step 12: Prepare the submission file

In [21]:
# Pair each test-row identifier with the label predicted for that row.
# NOTE(review): the training summary printed earlier lists the identifier
# column as 'Id'; confirm the test file really exposes it as lowercase 'id'.
submission_columns = {
    'ID': test_dataset['id'],
    'Target': predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Step 13: Save to CSV

In [22]:
# Write the submission table to a CSV file in the current working directory,
# omitting the DataFrame index.
submission_path = 'submission_file.csv'
submission_df.to_csv(submission_path, index=False)