# In [None]:  (cell marker left over from the Jupyter notebook export)
# 1_data_exploration_visualization.py
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Load the Titanic passenger data and print a first-pass profile of it.
# Every summary is printed explicitly: a bare expression such as `df.head()`
# is evaluated and then thrown away unless it happens to be the last line of
# a notebook cell, so nothing would be shown when this runs as a script.
df = pd.read_csv('Titanic-Dataset.csv')
print(df.head())              # first five rows
df.info()                     # writes dtypes / non-null counts itself, returns None
print(df.describe())          # summary statistics of the numeric columns
print(df.isnull().sum())      # missing values per column
print(df.duplicated().sum())  # number of fully duplicated rows
print(df.nunique())           # distinct values per column

# Bar charts of passenger counts for three categorical columns.
# Each entry is (column, seaborn palette, chart title, x-axis label).
simple_count_plots = (
    ('Sex', 'pastel', 'Passenger Count by Gender', 'Gender'),
    ('Pclass', 'Set2', 'Count of Passengers by Class', 'Passenger Class'),
    ('Embarked', 'Set3', 'Count of Passengers by Embarkation Port', 'Embarked'),
)
for column, palette_name, chart_title, x_label in simple_count_plots:
    ax = sns.countplot(data=df, x=column, palette=palette_name)
    ax.set_title(chart_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')
    plt.show()

# Counts per survival outcome, with the two tick positions relabelled so the
# axis reads as text instead of the raw 0/1 values.
ax = sns.countplot(data=df, x='Survived', palette='Set1')
plt.xticks([0, 1], ['Did Not Survive', 'Survived'])
ax.set_title('Survival Count')
ax.set_xlabel('Survival Status')
ax.set_ylabel('Number of Passengers')
plt.show()

# Draw one histogram per numeric column, each on its own 6x4 figure.
numeric_cols = df.select_dtypes(include='number').columns
for column_name in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(
        df[column_name],
        kde=False,
        color='skyblue',
        edgecolor='black',
        ax=ax,
    )
    ax.set_title(f'Distribution of {column_name}')
    ax.set_xlabel(column_name)
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()

# Pairwise scatter plots of the int64/float64 columns, coloured by survival.
pairplot_columns = df.select_dtypes(include=['float64', 'int64'])
sns.pairplot(pairplot_columns, hue='Survived', palette='coolwarm')
# y > 1 lifts the title above the top row of panels.
plt.suptitle('Scatter Plots for All Numerical Features', y=1.02)
plt.show()

# Annotated heatmap of the pairwise correlations between the same columns.
numeric_df = df.select_dtypes(include=['int64', 'float64'])
correlation = numeric_df.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
plt.show()

# 2_feature_engineering.py
# Replace the two relatives-aboard counts with their sum, 'family_size'.
df = (
    df.assign(family_size=df['Parch'] + df['SibSp'])
      .drop(columns=['Parch', 'SibSp'])
)

def get_family_type(size):
    """Map a family-size number to a coarse label.

    Args:
        size: Numeric family size (the script passes Parch + SibSp).

    Returns:
        'Alone' when size equals 0, 'Small' for any other value up to 2,
        'Medium' for values above 2 and up to 4, and 'Large' for everything
        else.
    """
    if size == 0:
        return 'Alone'
    if size <= 2:
        return 'Small'
    return 'Medium' if size <= 4 else 'Large'
# Label every passenger with a family-size bucket, then drop the raw count so
# only the bucket remains as a feature.
df['family_type'] = df['family_size'].apply(get_family_type)
df = df.drop(columns=['family_size'])

# Survival breakdown for each categorical feature. normalize='index' makes
# every row sum to 1, so multiplying by 100 gives row percentages.
# The tables are printed because a bare crosstab expression is evaluated and
# discarded when it is not the last line of a notebook cell.
for feature in ('family_type', 'Sex', 'Pclass', 'Embarked'):
    print(pd.crosstab(df[feature], df['Survived'], normalize='index') * 100)

# Extract the honorific from each name: the first run of ASCII letters that is
# immediately followed by a period. The pattern is a raw string so that `\.`
# reaches the regex engine as an escaped dot instead of being an invalid
# string escape sequence.
df['Title'] = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Collapse the extracted titles into a few groups. Keys must be single words,
# because that is all the pattern above can capture.
title_mapping = {
    'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Mrs', 'Ms': 'Mrs', 'Mme': 'Mrs', 'Mlle': 'Mrs', 'Master': 'Mr',
    'Dr': 'Officers', 'Rev': 'Officers', 'Col': 'Officers', 'Major': 'Officers', 'Capt': 'Officers',
    'Sir': 'VIP', 'Lady': 'VIP', 'Don': 'VIP', 'Jonkheer': 'VIP',
    # "the Countess." is captured as 'Countess'; a 'the Countess' key would
    # never match and the title would silently become NaN.
    'Countess': 'VIP',
}
# Any title that is not a key of the mapping becomes NaN.
df['Title'] = df['Title'].map(title_mapping)

# These columns are not used as features from here on.
df = df.drop(columns=['Name', 'PassengerId', 'Ticket'])

# Bucket fare and age into ordered categories. pd.cut intervals are
# right-closed and, by default, open on the lowest edge, so a value of exactly
# 0 would fall outside every bin and become NaN; include_lowest=True closes
# the first interval on the left so such rows land in the first bucket.
df['fare_category'] = pd.cut(
    df['Fare'],
    bins=[0, 10, 50, 100, float('inf')],
    labels=['Low', 'Medium', 'High', 'Very High'],
    include_lowest=True,
)
df['age_category'] = pd.cut(
    df['Age'],
    bins=[0, 12, 19, 59, float('inf')],
    labels=['Child', 'Teen', 'Adult', 'Senior'],
    include_lowest=True,
)
# Keep only the bucketed versions of the two columns.
df = df.drop(columns=['Fare', 'Age'])

# Survival breakdown (row percentages) for the combined and individual
# buckets. Printed explicitly so the tables are not silently discarded.
print(pd.crosstab([df['fare_category'], df['age_category']], df['Survived'], normalize='index') * 100)
print(pd.crosstab(df['age_category'], df['Survived'], normalize='index') * 100)
print(pd.crosstab(df['fare_category'], df['Survived'], normalize='index') * 100)

# Store the remaining text-valued features as pandas categoricals.
for column in ('Sex', 'Embarked', 'family_type', 'Title'):
    df[column] = df[column].astype('category')

# Preview of the one-hot encoded frame. get_dummies returns a new DataFrame
# and leaves df untouched, so the result is printed rather than discarded.
# NOTE(review): df itself stays un-encoded here; the training section runs
# its own get_dummies call after the train/test split, and that call does not
# list 'Pclass' explicitly -- confirm which encoding is the intended one.
print(
    pd.get_dummies(
        df,
        columns=['Sex', 'Embarked', 'Title', 'family_type', 'fare_category', 'age_category', 'Pclass'],
        drop_first=True,
    )
)

# 3_model_training_random_forest.py
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score

# Separate the feature columns from the target.
X = df.drop(columns=['Survived'])
y = df['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# One-hot encode each split on its own, then force the test frame onto exactly
# the training columns: join='left' keeps the training column set, and
# fill_value=0 fills dummy columns that are missing from the test split.
# NOTE(review): every non-numeric column still in df is expanded here,
# including any high-cardinality text column (e.g. 'Cabin', if the CSV has
# one) -- confirm that is intended.
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_test_encoded = pd.get_dummies(X_test, drop_first=True)
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

# Fit the scaler on the training data only and reuse it for the test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)
# Column 1 of predict_proba is the probability of the second class in
# rf_model.classes_ (the larger label).
y_proba = rf_model.predict_proba(X_test_scaled)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba):.2f}")

# Persist the fitted model and scaler together with the column layout they
# were fitted on. The saved list is taken from the *encoded* training frame:
# the raw X_train columns do not match the model's inputs, and new data has to
# be re-indexed to this exact dummy-column list before scaling and predicting.
joblib.dump(rf_model, 'random_forest_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(X_train_encoded.columns.tolist(), 'feature_columns.joblib')

# 4_categorical_analysis.py
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many passengers have each family size (SibSp + Parch).
# The feature-engineering step drops 'Parch' and 'SibSp' from df, so reading
# them from df directly raises KeyError. Use them if they are still present,
# otherwise read just those two columns from the source file again.
if {'Parch', 'SibSp'}.issubset(df.columns):
    family_size = df['Parch'] + df['SibSp']
else:
    # Assumes df still has the row index of the original CSV (no rows dropped
    # or reordered), so the assignment below aligns row for row.
    raw_family = pd.read_csv('Titanic-Dataset.csv', usecols=['Parch', 'SibSp'])
    family_size = raw_family['Parch'] + raw_family['SibSp']
df['family_size'] = family_size
df['family_size'].value_counts().plot(kind='bar')
# Labels describe the family-size data that is actually plotted (they
# previously referred to the embarkation port).
plt.xlabel('Family Size (SibSp + Parch)')
plt.ylabel('Number of Passengers')
plt.title('Number of Passengers per Family Size')
plt.show()

# Two-level view: travelling alone (family size 0) versus with family.
titanic = df.copy()
titanic['Family_type'] = titanic['family_size'].apply(lambda x: 'Alone' if x == 0 else 'With Family')
plt.figure(figsize=(4, 5))
sns.countplot(data=titanic, x='Family_type', hue='Survived')
plt.xlabel('Family Type')
plt.ylabel('Number of Passengers')
plt.title('Comparison of Survived vs Family Type')
plt.legend(title='Survived', loc='upper left', labels=['Not Survived', 'Survived'])
plt.show()

# Same comparison as a stacked bar chart: one bar per family type, split by
# the Survived value (columns of the unstacked count table).
survival_counts = titanic.groupby(['Family_type', 'Survived']).size().unstack()
survival_counts.plot(kind='bar', stacked=True, figsize=(4, 3), color=['red', 'green'])
plt.xlabel('Family Type')
plt.ylabel('Number of Passengers')
plt.title('Comparison of Survived vs Family Type (Stacked)')
plt.xticks(rotation=0)
plt.legend(title='Survived', labels=['Not Survived', 'Survived'])
plt.show()

# Statistical insights
from scipy.stats import chi2_contingency
# Chi-square test on the Survived x Pclass contingency table; only the test
# statistic and the p-value are reported.
crosstab = pd.crosstab(index=df['Survived'], columns=df['Pclass'])
chi2, p = chi2_contingency(crosstab)[:2]
print(crosstab)
print(f"Chi2 Stat: {chi2:.2f}, p-value: {p:.4f}")

# Sum of the Survived column within every (Pclass, Sex) group.
survival_by_group = df.groupby(['Pclass', 'Sex'])['Survived']
grouped_data = survival_by_group.sum()
print(grouped_data)

# Summary statistics for the categorical 'Title' feature.
value_counts = df['Title'].value_counts()  # absolute frequency per title group
print(value_counts)
frequency_distribution = df['Title'].value_counts(normalize=True)  # relative frequency
print(frequency_distribution)
# Describe every non-numeric column. 'category' is listed as well as 'object'
# because Sex, Embarked, family_type and Title are cast to the category dtype
# during feature engineering and would be skipped by include=['object'] alone.
categorical_description = df.describe(include=['object', 'category'])
print(categorical_description)
mode_value = df['Title'].mode()[0]  # most frequent title group (first one if tied)
print(mode_value)
cardinality = df['Title'].nunique()  # number of distinct title groups, NaN excluded
print(cardinality)
