<a href="https://colab.research.google.com/github/Wazir123456/The-Developer-Arena-Internship-Tasks-8/blob/main/Developer_Arena_Task_8_%26_screenshot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# =========================================================
# COMPLETE DATA SCIENCE PIPELINE (PHASE 1 ‚Üí PHASE 5)
# Works for multiple datasets automatically
# =========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

sns.set()

# ---------------------------------------------------------
# DATASETS (paths already uploaded in your environment)
# ---------------------------------------------------------
datasets = {
    "House Prices": "/content/house_prices.csv",
    "Customer Churn": "/content/customer_churn.csv",
    "Sales Data": "/content/sales_data.csv"
}

output_folder = "results"
os.makedirs(output_folder, exist_ok=True)


# =========================================================
# PHASE 1 ‚Äì PROJECT PLANNING
# =========================================================
def project_planning(name, df):
    print("\n" + "="*60)
    print(f"üìå PROJECT: {name}")
    print("="*60)

    print("Problem: Extract insights & build data understanding")
    print("Objectives: Clean ‚Üí Explore ‚Üí Analyze ‚Üí Recommend")
    print("Success Metrics: data quality, patterns found, statistical validity")

    print("\nShape:", df.shape)
    print("Columns:", list(df.columns))


# =========================================================
# PHASE 2 ‚Äì DATA CLEANING
# =========================================================
def clean_data(df):
    print("\nüîπ Cleaning Data...")

    # Remove duplicates
    df = df.drop_duplicates()

    # Fill missing values
    for col in df.columns:
        if df[col].dtype == "object":
            df[col].fillna(df[col].mode()[0], inplace=True)
        else:
            df[col].fillna(df[col].median(), inplace=True)

    return df


# =========================================================
# PHASE 3 ‚Äì EDA + VISUALIZATION
# =========================================================
def eda_analysis(name, df):

    print("\nüîπ Running EDA...")

    print("\nSummary Stats")
    print(df.describe())

    print("\nMissing Values")
    print(df.isnull().sum())

    numeric_cols = df.select_dtypes(include=np.number).columns

    # Histograms
    for col in numeric_cols:
        plt.figure()
        df[col].hist(bins=30)
        plt.title(f"{name} - {col} Distribution")
        plt.savefig(f"{output_folder}/{name}_{col}_hist.png")
        plt.close()

    # Correlation heatmap
    if len(numeric_cols) > 1:
        plt.figure(figsize=(8,6))
        sns.heatmap(df[numeric_cols].corr(), annot=True)
        plt.title(f"{name} Correlation Heatmap")
        plt.savefig(f"{output_folder}/{name}_correlation.png")
        plt.close()


# =========================================================
# PHASE 4 ‚Äì ADVANCED STATISTICAL ANALYSIS
# =========================================================
def advanced_analysis(df):

    print("\nüîπ Statistical Analysis...")

    numeric_cols = df.select_dtypes(include=np.number).columns

    results = {}

    if len(numeric_cols) >= 2:
        col1, col2 = numeric_cols[0], numeric_cols[1]

        corr, p = stats.pearsonr(df[col1], df[col2])

        results["Correlation"] = corr
        results["p_value"] = p

        print(f"Correlation between {col1} & {col2} = {corr:.3f} (p={p:.4f})")

    return results


# =========================================================
# PHASE 5 ‚Äì INSIGHTS & RECOMMENDATIONS
# =========================================================
def insights(name, df, stats_results):

    print("\nüîπ Generating Insights...")

    insights_list = []

    insights_list.append(f"Dataset has {df.shape[0]} rows and {df.shape[1]} features.")

    if "Correlation" in stats_results:
        if abs(stats_results["Correlation"]) > 0.7:
            insights_list.append("Strong relationship detected between major variables.")
        else:
            insights_list.append("Weak/Moderate relationships detected.")

    insights_list.append("Data cleaned and ready for modeling.")
    insights_list.append("Recommend applying ML models or predictive analysis next.")

    with open(f"{output_folder}/{name}_insights.txt", "w") as f:
        for i in insights_list:
            f.write(i + "\n")

    print("\n".join(insights_list))


# =========================================================
# MASTER PIPELINE
# =========================================================
def run_pipeline(name, path):

    df = pd.read_csv(path)

    project_planning(name, df)

    df = clean_data(df)

    eda_analysis(name, df)

    stats_results = advanced_analysis(df)

    insights(name, df, stats_results)


# =========================================================
# RUN FOR ALL DATASETS
# =========================================================
for name, path in datasets.items():
    run_pipeline(name, path)

print("\n‚úÖ All datasets processed successfully!")
print("üìÅ Check 'results/' folder for charts & insights files")


üìå PROJECT: House Prices
Problem: Extract insights & build data understanding
Objectives: Clean ‚Üí Explore ‚Üí Analyze ‚Üí Recommend
Success Metrics: data quality, patterns found, statistical validity

Shape: (300, 8)
Columns: ['Property_ID', 'Area', 'Bedrooms', 'Bathrooms', 'Age', 'Location', 'Property_Type', 'Price']

üîπ Cleaning Data...

üîπ Running EDA...

Summary Stats
             Area    Bedrooms   Bathrooms         Age         Price
count   300.00000  300.000000  300.000000  300.000000  3.000000e+02
mean   2759.70000    3.033333    2.026667   25.000000  2.488366e+07
std    1297.68143    1.467219    0.792495   14.332646  1.266525e+07
min     520.00000    1.000000    1.000000    0.000000  3.695000e+06
25%    1675.75000    2.000000    1.000000   12.000000  1.527750e+07
50%    2738.00000    3.000000    2.000000   25.500000  2.236500e+07
75%    3801.25000    4.000000    3.000000   36.250000  3.460812e+07
max    4999.00000    5.000000    3.000000   49.000000  5.870000e+07

Mis

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)



üîπ Statistical Analysis...
Correlation between Area & Bedrooms = -0.004 (p=0.9420)

üîπ Generating Insights...
Dataset has 300 rows and 8 features.
Weak/Moderate relationships detected.
Data cleaned and ready for modeling.
Recommend applying ML models or predictive analysis next.

üìå PROJECT: Customer Churn
Problem: Extract insights & build data understanding
Objectives: Clean ‚Üí Explore ‚Üí Analyze ‚Üí Recommend
Success Metrics: data quality, patterns found, statistical validity

Shape: (500, 9)
Columns: ['CustomerID', 'Tenure', 'MonthlyCharges', 'TotalCharges', 'Contract', 'PaymentMethod', 'PaperlessBilling', 'SeniorCitizen', 'Churn']

üîπ Cleaning Data...

üîπ Running EDA...

Summary Stats
           Tenure  MonthlyCharges  TotalCharges  SeniorCitizen       Churn
count  500.000000      500.000000    500.000000     500.000000  500.000000
mean    36.532000      113.636000   4237.882000       0.498000    0.106000
std     20.667057       51.799903   2260.619837       0.500497  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)



üîπ Statistical Analysis...
Correlation between Tenure & MonthlyCharges = -0.060 (p=0.1829)

üîπ Generating Insights...
Dataset has 500 rows and 9 features.
Weak/Moderate relationships detected.
Data cleaned and ready for modeling.
Recommend applying ML models or predictive analysis next.

üìå PROJECT: Sales Data
Problem: Extract insights & build data understanding
Objectives: Clean ‚Üí Explore ‚Üí Analyze ‚Üí Recommend
Success Metrics: data quality, patterns found, statistical validity

Shape: (100, 7)
Columns: ['Date', 'Product', 'Quantity', 'Price', 'Customer_ID', 'Region', 'Total_Sales']

üîπ Cleaning Data...

üîπ Running EDA...

Summary Stats
         Quantity         Price    Total_Sales
count  100.000000    100.000000     100.000000
mean     4.780000  25808.510000  123650.480000
std      2.588163  13917.630242  100161.085275
min      1.000000   1308.000000    6540.000000
25%      2.750000  14965.250000   39517.500000
50%      5.000000  24192.000000   97955.500000
75%      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)



üîπ Statistical Analysis...
Correlation between Quantity & Price = 0.008 (p=0.9369)

üîπ Generating Insights...
Dataset has 100 rows and 7 features.
Weak/Moderate relationships detected.
Data cleaned and ready for modeling.
Recommend applying ML models or predictive analysis next.

‚úÖ All datasets processed successfully!
üìÅ Check 'results/' folder for charts & insights files
