<a href="https://colab.research.google.com/github/Shehab-Mechanical/codes/blob/main/HCT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler

# Step 1: Load Data
# The four paths below are placeholders; point them at the real CSV files
# before running.
train_file = "path_to_train.csv"  # Replace with the path to Train.csv
test_file = "path_to_test.csv"  # Replace with the path to Test.csv
data_dict_file = "path_to_data_dictionary.csv"  # Replace with Data Dictionary.csv
sample_submission_file = "path_to_sample_submission.csv"  # Replace with Sample Submission.csv

# Read every CSV into its own DataFrame (data dictionary first, then train,
# test and the sample submission).
data_dict, train_data, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        data_dict_file,
        train_file,
        test_file,
        sample_submission_file,
    )
)

# Step 2: Data Exploration
# Print the first rows of each loaded table under its own heading.
for _heading, _frame in (
    ("Data Dictionary Preview:\n", data_dict),
    ("\nTrain Data Preview:\n", train_data),
    ("\nTest Data Preview:\n", test_data),
    ("\nSample Submission Preview:\n", sample_submission),
):
    print(_heading, _frame.head())

# Print the column/dtype/non-null summary of the train and test tables.
for _heading, _frame in (
    ("\nTrain Data Info:\n", train_data),
    ("\nTest Data Info:\n", test_data),
):
    print(_heading)
    _frame.info()

# Step 3: Handle Missing Values and Preprocessing
# Separate features and target; the "ID" column is excluded from the features.
X = train_data.drop(columns=["ID", "TargetColumn"])  # Replace "TargetColumn" with the actual target column name.
y = train_data["TargetColumn"]

# Identify categorical and numerical columns by dtype.
cat_features = X.select_dtypes(include=["object", "category"]).columns
num_features = X.select_dtypes(include=["int64", "float64"]).columns

# Numerical columns: fill missing values with the column median, then
# standardize.
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then map each category to an integer code.
# OrdinalEncoder is used instead of LabelEncoder: LabelEncoder is meant for
# the target only (its fit/transform take a single 1-D `y` argument), so it
# raises a TypeError when a Pipeline calls it on feature columns.
# Categories not seen during fit are encoded as -1 so that validation/test
# rows with new values do not make predict() fail.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features)
    ]
)

# Step 4: Split Data
# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: Build the Model Pipeline
# Preprocessing and the random-forest regressor are chained so both are
# fitted together on the training split.
_forest = RandomForestRegressor(random_state=42)
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", _forest),
])

# Step 6: Train the Model
model.fit(X_train, y_train)

# Step 7: Evaluate the Model
# Score the held-out rows with mean squared error.
val_predictions = model.predict(X_val)
mse = mean_squared_error(y_val, val_predictions)
print("Validation Mean Squared Error:", mse)

# Step 8: Predict on Test Data
# Drop the "ID" column, as was done for the training features, then predict.
X_test = test_data.drop(columns=["ID"])
test_predictions = model.predict(X_test)

# Step 9: Save Submission
# Work on a copy of the sample submission with the predictions stored in a
# "prediction" column, and write it out without the index.
submission = sample_submission.assign(prediction=test_predictions)
submission.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv")

# Step 10: Visualizations (Optional)
# Histogram of the target with a KDE curve overlaid.
_target_ax = sns.histplot(y, kde=True)
_target_ax.set_title("Target Variable Distribution")
plt.show()
