In [8]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# --- User config ---
# DATA_PATH is the single place to point the script at a different CSV;
# OUT_DIR receives every CSV/PNG/TXT artifact written below.
DATA_PATH = "C:/Users/srushty/Downloads/archive/Titanic-Dataset.csv"  # <- change if your CSV has a different name/path
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# --- Load data ---
# Read from DATA_PATH (not a repeated literal) so that editing the config
# value above actually changes which file is loaded.
df = pd.read_csv(DATA_PATH)

# Optional: standardize column names (lowercase, surrounding whitespace removed)
# so the rest of the script can refer to columns such as 'age' and 'fare'.
df.columns = [c.strip().lower() for c in df.columns]

# --- Quick peek ---
# Show the table dimensions and the first rows as a sanity check.
print("Rows, cols:", df.shape)
print(df.head())

# --- Descriptive statistics ---
# Two summaries: the default describe() output and one covering every column
# (include='all'). Each is written to its own CSV in OUT_DIR.
desc_numeric = df.describe()
desc_all = df.describe(include='all')
for summary, csv_name in (
    (desc_numeric, "descriptive_numeric.csv"),
    (desc_all, "descriptive_all.csv"),
):
    summary.to_csv(os.path.join(OUT_DIR, csv_name))

# --- Missing values summary ---
# Per-column count and percentage of null cells, most-missing first.
null_mask = df.isnull()
missing_count = null_mask.sum().sort_values(ascending=False)
missing_pct = (null_mask.mean() * 100).round(2).sort_values(ascending=False)
# concat aligns the two Series on column name; `keys` supplies the headers.
missing_df = pd.concat(
    [missing_count, missing_pct],
    axis=1,
    keys=["missing_count", "missing_percent"],
)
missing_df.to_csv(os.path.join(OUT_DIR, "missing_summary.csv"))
print("\nMissing summary:\n", missing_df.head(20))

# --- Numeric columns to analyze (attempt to auto-detect common numeric features) ---
# Prefer the well-known feature names when present; if none of them exist,
# fall back to every column pandas reports as numeric.
possible_numeric = ['age', 'fare', 'sibsp', 'parch']
numeric_cols = (
    [name for name in possible_numeric if name in df.columns]
    or df.select_dtypes(include=[np.number]).columns.tolist()
)
print("Numeric columns used:", numeric_cols)

# --- Histograms (one file per numeric column) ---
# Each column in numeric_cols gets its own 30-bin histogram PNG in OUT_DIR;
# null values are dropped before binning.
for col in numeric_cols:
    fname = os.path.join(OUT_DIR, f"hist_{col}.png")
    fig, ax = plt.subplots(figsize=(6,4))
    ax.hist(df[col].dropna(), bins=30)
    ax.set_title(f"Histogram of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(fname)
    # Close the figure so figures do not accumulate across iterations.
    plt.close(fig)

# --- Boxplots for selected numeric features (age, fare) if available ---
# One boxplot PNG per feature; features absent from the data are skipped.
for col in ['age', 'fare']:
    if col not in df.columns:
        continue
    fig, ax = plt.subplots(figsize=(6,3))
    ax.boxplot(df[col].dropna())
    ax.set_title(f"Boxplot of {col}")
    # The single box sits at x=1; label that tick with the column name.
    ax.set_xticks([1])
    ax.set_xticklabels([col])
    fig.tight_layout()
    fig.savefig(os.path.join(OUT_DIR, f"box_{col}.png"))
    plt.close(fig)

# --- Scatter: Age vs Fare, marker by survived if available ---
if 'age' in df.columns and 'fare' in df.columns:
    # Drop rows where EITHER coordinate is missing, in one step, so x and y
    # always come from the same rows. Dropping nulls from each column
    # separately yields different-length, misaligned series, which
    # plt.scatter rejects.
    paired = df.dropna(subset=['age', 'fare'])
    plt.figure(figsize=(6,4))
    if 'survived' in df.columns:
        # Plot survived=1 as 'o', survived=0 as 'x' (no specific color)
        for s_val, marker in [(1,'o'), (0,'x')]:
            subset = paired[paired['survived'] == s_val]
            if not subset.empty:
                plt.scatter(subset['age'], subset['fare'], marker=marker, alpha=0.7, label=f"survived={s_val}")
        plt.legend()
    else:
        plt.scatter(paired['age'], paired['fare'], alpha=0.6)
    plt.xlabel("Age")
    plt.ylabel("Fare")
    plt.title("Scatter: Age vs Fare")
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "scatter_age_fare.png"))
    plt.close()

# --- Correlation matrix (numeric only) using matplotlib imshow ---
# Renders the pairwise correlations of numeric_cols as a heatmap PNG and
# also saves the raw matrix as CSV.
if numeric_cols:
    corr = df[numeric_cols].corr()
    n_features = len(corr.columns)
    plt.figure(figsize=(5,4))
    im = plt.imshow(corr.values, interpolation='nearest')
    # Rows and columns of the matrix are labelled with the feature names.
    plt.xticks(range(n_features), corr.columns, rotation=45)
    plt.yticks(range(n_features), corr.index)
    plt.title("Correlation matrix (numeric features)")
    plt.colorbar(im)
    plt.tight_layout()
    heatmap_path = os.path.join(OUT_DIR, "corr_matrix.png")
    plt.savefig(heatmap_path)
    plt.close()
    matrix_path = os.path.join(OUT_DIR, "correlation_matrix.csv")
    corr.to_csv(matrix_path)

# --- Pairwise scatter matrix (pandas) ---
# Needs at least two columns to form a pair; only rows with no nulls in any
# of numeric_cols are plotted.
if len(numeric_cols) >= 2:
    complete_rows = df[numeric_cols].dropna()
    axes = scatter_matrix(
        complete_rows,
        alpha=0.6,
        diagonal='hist',
        figsize=(9,9),
    )
    plt.suptitle("Scatter matrix")
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "scatter_matrix.png"))
    plt.close()

# --- VIF (Variance Inflation Factor) to check multicollinearity ---
# Best-effort step: any failure is reported and the script continues, with
# vif_results left as None.
vif_results = None
try:
    # VIF needs complete rows, so drop any row with a null in numeric_cols.
    vif_data = df[numeric_cols].dropna()
    # has_constant='add' forces an intercept column to be prepended at index 0.
    # With the default ('skip'), data that already contains a constant column
    # gets no intercept, and the loop below would then silently drop the first
    # real feature instead of the intercept.
    X = sm.add_constant(vif_data, has_constant='add')
    vif_list = [
        {'feature': X.columns[i], 'VIF': variance_inflation_factor(X.values, i)}
        for i in range(1, X.shape[1])  # skip const at 0
    ]
    vif_results = pd.DataFrame(vif_list)
    vif_results.to_csv(os.path.join(OUT_DIR, "vif_numeric.csv"), index=False)
    print("\nVIF results:\n", vif_results)
except Exception as e:
    print("VIF calculation skipped or failed:", e)

# --- Derived features: family_size (if sibsp/parch exist) ---
# family_size = sibsp + parch + 1, with nulls in either input counted as 0.
# The resulting column is added to df and also written to CSV.
if {'sibsp', 'parch'}.issubset(df.columns):
    siblings_spouses = df['sibsp'].fillna(0)
    parents_children = df['parch'].fillna(0)
    df['family_size'] = siblings_spouses + parents_children + 1
    df[['family_size']].to_csv(os.path.join(OUT_DIR, "family_size_sample.csv"))

# --- Simple insights (automatically produced) ---
# Collects one-line findings in a fixed order (age missingness, fare skewness,
# fare outlier fence) and writes them to insights.txt, one per line.
insights = []

# Missingness
if 'age' in df.columns:
    pct_age_missing = df['age'].isnull().mean() * 100
    insights.append(f"Age missing: {pct_age_missing:.2f}%")

if 'fare' in df.columns:
    fare = df['fare']
    # Skewness/fare
    skew_fare = fare.skew()
    insights.append(f"Fare skewness: {skew_fare:.2f} (right skew if positive)")
    # Outliers check (simple): upper fence is Q3 + 1.5 * IQR
    q1, q3 = fare.quantile(0.25), fare.quantile(0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    insights.append(f"Fare upper fence: {upper:.2f}; values above this are potential outliers.")

# Save insights
insights_path = os.path.join(OUT_DIR, "insights.txt")
with open(insights_path, "w") as f:
    f.write("\n".join(insights))

# Final status: report where the artifacts went and list what OUT_DIR now
# contains. (Message typo fixed: "ED A" -> "EDA".)
print("\nEDA finished. Outputs saved to:", OUT_DIR)
print("Generated files:", os.listdir(OUT_DIR))

Rows, cols: (891, 12)
   passengerid  survived  pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                name     sex   age  sibsp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   parch            ticket     fare cabin embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.05