In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Upload your dataset
from google.colab import files
uploaded = files.upload()  # Upload data.csv

# Load the dataset
# `uploaded` is keyed by file name. Fail with an actionable message rather
# than a bare IndexError when it is empty (e.g. the upload was cancelled).
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")
# Only the first uploaded file is read; any additional files are ignored.
uploaded_filename = next(iter(uploaded))
df = pd.read_csv(uploaded_filename)  # Read the uploaded CSV file
print("Dataset loaded successfully!")

# Display basic dataset information
print("\nDataset Overview:")
first_rows = df.head()
display(first_rows)

# df.info() prints its report itself, so it is not wrapped in print/display.
print("\nGeneral Information:")
df.info()

print("\nSummary Statistics:")
summary_stats = df.describe()
display(summary_stats)

# Check for missing values
# Build the boolean null mask once; it feeds both the per-column counts
# and the heatmap below.
missing_mask = df.isnull()
print("\nMissing Values:")
print(missing_mask.sum())

# Visualizing missing values
# Each cell of the heatmap is one (row, column) entry of the null mask.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 5))
sns.heatmap(missing_mask, cbar=False, cmap='viridis', ax=heatmap_ax)
heatmap_ax.set_title("Missing Values Heatmap")
plt.show()

# Distribution of numerical columns
# Draw one figure per int64/float64 column: a 30-bin histogram with a KDE
# curve overlaid.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
for column_name in numerical_columns:
    _, hist_ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[column_name], kde=True, bins=30, ax=hist_ax)
    hist_ax.set_title(f"Distribution of {column_name}")
    plt.show()

# Preprocessing: Handle missing values
# Drop every row that contains at least one missing value. dropna() returns
# a new frame, so `df` keeps the original rows.
df_cleaned = df.dropna()

# Encode categorical columns
# One-hot encode the object-dtype columns. drop_first=True keeps k-1 dummy
# columns for a column with k distinct levels.
categorical_columns = df_cleaned.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df_cleaned, columns=categorical_columns, drop_first=True)

# Normalize numerical columns
# Min-max scale each int64/float64 column: (x - min) / (max - min).
numerical_columns = df_encoded.select_dtypes(include=['int64', 'float64']).columns
column_min = df_encoded[numerical_columns].min()
column_range = df_encoded[numerical_columns].max() - column_min
# A constant column has a range of 0, and 0 / 0 would turn the whole column
# into NaN. Dividing by 1 instead maps every value in such a column to 0.
column_range = column_range.where(column_range != 0, 1)
df_encoded[numerical_columns] = (
    df_encoded[numerical_columns] - column_min
) / column_range

# Display the processed dataset
print("\nProcessed Dataset:")
display(df_encoded.head())

# Save the processed dataset
# A single name for the output file keeps the write and the download in sync.
processed_csv_name = "processed_data.csv"
df_encoded.to_csv(processed_csv_name, index=False)  # index=False: omit the row index

# Download the processed dataset
# Re-imported here so this step still works if it is run as its own cell.
from google.colab import files
files.download(processed_csv_name)
