In [None]:
# Analyze target variable (replace 'PRICE' with your target column name).
# `df` and `analyze_target` are not defined in this section (presumably they
# come from earlier notebook cells). The returned `problem_type` is later
# compared against 'regression' to choose between the regression and the
# classification plots/models.
target_col = 'PRICE'
problem_type = analyze_target(df, target_col)

# ### 3.5 Feature Relationships Analysis

def analyze_feature_relationships(df, target_col, problem_type, max_features=10):
    """
    Analyze relationships between numeric features and the target.

    Shows, in order: a correlation heatmap, a table of highly correlated
    feature pairs, one plot per feature against the target, and a
    preliminary random-forest feature importance ranking.

    How to interpret:
    - Correlation heatmap: Darker colors (closer to ±1) indicate stronger correlations
    - Feature-target relationships: Look for clear patterns that suggest predictive value
    - Multicollinearity: Features with correlation >0.7 may be redundant

    Parameters:
    - df: DataFrame with the data
    - target_col: Name of the target column
    - problem_type: 'regression' selects scatter plots and a
      RandomForestRegressor; any other value is treated as classification
      (box plots and a RandomForestClassifier)
    - max_features: Maximum number of numeric features to analyze

    Returns:
    - None (all output is printed, plotted or displayed)
    """
    from itertools import combinations

    # Select numeric columns only (excluding target)
    numeric_cols = [
        col
        for col in df.select_dtypes(include=['int64', 'float64']).columns
        if col != target_col
    ]

    if not numeric_cols:
        print("No numeric features to analyze relationships.")
        return

    # A non-numeric target (e.g. string class labels) cannot take part in a
    # Pearson correlation; recent pandas raises instead of silently dropping it.
    target_is_numeric = pd.api.types.is_numeric_dtype(df[target_col])

    # If too many features, narrow the set down
    if len(numeric_cols) > max_features:
        if target_is_numeric:
            print(f"Too many features ({len(numeric_cols)}). Selecting top {max_features} most correlated with target.")
            correlations = df[numeric_cols].corrwith(df[target_col]).abs().sort_values(ascending=False)
            numeric_cols = correlations.head(max_features).index.tolist()
        else:
            # No correlation ranking is possible, so keep the leading columns.
            print(f"Too many features ({len(numeric_cols)}). Target is not numeric; keeping the first {max_features} features.")
            numeric_cols = numeric_cols[:max_features]

    # Include the target in the correlation matrix only when it is numeric.
    # The features always come first, so positions 0..len(numeric_cols)-1 of
    # the matrix correspond to numeric_cols.
    corr_cols = numeric_cols + [target_col] if target_is_numeric else list(numeric_cols)

    # Create correlation matrix
    corr_matrix = df[corr_cols].corr()

    # Plot correlation heatmap
    plt.figure(figsize=(12, 10))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  # Create mask for upper triangle
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', mask=mask,
                linewidths=0.5, vmin=-1, vmax=1)
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Identify highly correlated feature pairs (potential multicollinearity)
    print("\nPotential multicollinearity (feature pairs with |correlation| > 0.7):")
    high_corr_pairs = []
    for (i, first), (j, second) in combinations(enumerate(numeric_cols), 2):
        corr = abs(corr_matrix.iloc[i, j])
        if corr > 0.7:
            high_corr_pairs.append((first, second, corr))

    if high_corr_pairs:
        high_corr_df = pd.DataFrame(high_corr_pairs, columns=['Feature 1', 'Feature 2', 'Correlation'])
        high_corr_df = high_corr_df.sort_values('Correlation', ascending=False)
        display(high_corr_df)
    else:
        print("No highly correlated feature pairs found.")

    # Plot relationship between each feature and target
    if problem_type == 'regression':
        # For regression problems, use scatter plots on a two-column grid
        n_rows = (len(numeric_cols) + 1) // 2
        # 3 inches of height per row (the previous expression mixed up
        # operator precedence and did not scale exactly with the row count).
        _, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(15, 3 * n_rows))
        axes = axes.flatten()

        for ax, col in zip(axes, numeric_cols):
            sns.regplot(x=df[col], y=df[target_col], ax=ax, scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
            ax.set_title(f'{col} vs {target_col}')
            ax.set_xlabel(col)
            ax.set_ylabel(target_col)

        # Hide unused subplots (at most one, when the feature count is odd)
        for ax in axes[len(numeric_cols):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

    else:  # classification
        # For classification problems, use box plots
        for col in numeric_cols:
            plt.figure(figsize=(10, 6))
            sns.boxplot(data=df, x=target_col, y=col)
            plt.title(f'{col} by {target_col}')
            plt.tight_layout()
            plt.show()

    # Feature importance using a basic model
    print("\nPreliminary feature importance:")
    X = df[numeric_cols]
    y = df[target_col]

    # Best-effort step: any failure while fitting or plotting is reported
    # rather than aborting the rest of the analysis.
    try:
        if problem_type == 'regression':
            model = RandomForestRegressor(n_estimators=100, random_state=42)
        else:
            model = RandomForestClassifier(n_estimators=100, random_state=42)

        model.fit(X, y)

        # Plot feature importance
        importance_df = pd.DataFrame({
            'Feature': numeric_cols,
            'Importance': model.feature_importances_
        }).sort_values('Importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(data=importance_df, x='Importance', y='Feature')
        plt.title('Feature Importance')
        plt.tight_layout()
        plt.show()

        print("\nFeature importance ranking:")
        display(importance_df)
    except Exception as e:
        print(f"Could not compute feature importance: {e}")

# Analyze feature relationships for the chosen target, using the default
# max_features=10 limit on the number of numeric features examined.
analyze_feature_relationships(df, target_col, problem_type)

# ## 4. Data Preprocessing

# ### 4.1 Handle Missing Values

def handle_missing_values(df, strategy='auto'):
    """
    Handle missing values in the dataset

    This function is interactive: the 'auto' and 'impute' strategies ask
    questions through input().

    Parameters:
    - df: DataFrame with the data
    - strategy: 'auto', 'remove', 'impute'
        - 'auto': offer to drop columns with >30% missing values and rows
          with >50% missing values, then impute whatever is left
        - 'remove': drop every row that contains a missing value
        - 'impute': impute numeric and categorical columns

    Returns:
    - DataFrame with handled missing values (the input itself when nothing
      is missing; imputation is done on a copy, not on the input)
    - Dictionary with imputers (if applicable), keyed by 'numeric' and
      'categorical'

    Raises:
    - ValueError: if missing values exist and strategy is not one of the
      supported values
    """
    print("Missing values analysis:")
    missing = df.isnull().sum()
    missing_percent = (missing / len(df)) * 100
    missing_data = pd.DataFrame({'Total': missing, 'Percent': missing_percent})
    missing_data = missing_data[missing_data['Total'] > 0].sort_values('Total', ascending=False)

    if missing_data.empty:
        print("No missing values found.")
        return df, {}

    # An unknown strategy used to fall through every branch and crash on the
    # final return with an unbound `imputers`; fail early with a clear message.
    if strategy not in ('auto', 'remove', 'impute'):
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected 'auto', 'remove' or 'impute'."
        )

    display(missing_data)

    # Auto strategy selection
    if strategy == 'auto':
        # If any column has >30% missing, suggest removing the column
        cols_to_drop = missing_data[missing_data['Percent'] > 30].index.tolist()

        if cols_to_drop:
            print(f"\nRecommendation: Consider dropping these columns with >30% missing values: {cols_to_drop}")
            user_input = input("Drop these columns? (y/n): ")
            if user_input.lower() == 'y':
                df = df.drop(columns=cols_to_drop)
                print(f"Dropped columns: {cols_to_drop}")

        # If any rows have many missing values, suggest removing those rows.
        # The threshold is based on the column count after any column drop.
        threshold = 0.5 * df.shape[1]  # If row is missing > 50% of values
        rows_to_drop = df[df.isnull().sum(axis=1) > threshold].index

        if len(rows_to_drop) > 0:
            percent_rows = len(rows_to_drop) / len(df) * 100
            print(f"\nFound {len(rows_to_drop)} rows ({percent_rows:.2f}%) with >50% missing values.")
            user_input = input("Drop these rows? (y/n): ")
            if user_input.lower() == 'y':
                df = df.drop(index=rows_to_drop)
                print(f"Dropped {len(rows_to_drop)} rows.")

        strategy = 'impute'  # Default to imputation for remaining missing values

    # If strategy is remove, drop all rows with any missing values
    if strategy == 'remove':
        rows_before = len(df)
        df = df.dropna()
        rows_after = len(df)
        print(f"Dropped {rows_before - rows_after} rows with missing values.")
        return df, {}

    # Only 'impute' can reach this point. Work on a copy so the caller's
    # DataFrame is never modified, whether or not anything was dropped above.
    df = df.copy()
    imputers = {}

    # Handle numeric columns
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    numeric_cols_with_missing = [col for col in numeric_cols if df[col].isnull().any()]

    if numeric_cols_with_missing:
        print("\nImputing numeric columns...")

        # Allow user to choose imputation method for numeric features
        print("Imputation methods for numeric columns:")
        print("1. Mean")
        print("2. Median")
        print("3. KNN imputation")
        user_input = input("Choose method (1/2/3, default=2): ")

        if user_input == '1':
            imputer = SimpleImputer(strategy='mean')
            method = 'mean'
        elif user_input == '3':
            imputer = KNNImputer(n_neighbors=5)
            method = 'knn'
        else:
            # Any other answer (including an empty one) selects the default
            imputer = SimpleImputer(strategy='median')
            method = 'median'

        # Fit and transform
        df[numeric_cols_with_missing] = imputer.fit_transform(df[numeric_cols_with_missing])
        imputers['numeric'] = imputer
        print(f"Imputed numeric columns using {method} strategy.")

    # Handle categorical columns
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    cat_cols_with_missing = [col for col in cat_cols if df[col].isnull().any()]

    if cat_cols_with_missing:
        print("\nImputing categorical columns...")

        # Allow user to choose imputation method for categorical features
        print("Imputation methods for categorical columns:")
        print("1. Most frequent value")
        print("2. Add 'Missing' category")
        user_input = input("Choose method (1/2, default=1): ")

        if user_input == '2':
            # Add 'Missing' category (no imputer object is recorded for this)
            for col in cat_cols_with_missing:
                df[col] = df[col].fillna('Missing')
            print("Added 'Missing' category to categorical columns with missing values.")
        else:
            # Most frequent
            imputer = SimpleImputer(strategy='most_frequent')
            df[cat_cols_with_missing] = imputer.fit_transform(df[cat_cols_with_missing])
            imputers['categorical'] = imputer
            print("Imputed categorical columns using most frequent value.")

    return df, imputers

# Handle missing values (uncomment to run)
# df, imputers = handle_missing_values(df)

# ### 4.2 Handle Outliers

def handle_outliers(df, columns=None, method='iqr', treatment='cap'):
    """
    Detect and handle outliers in the dataset

    Parameters:
    - df: DataFrame with the data (not modified; capping works on a copy)
    - columns: List of columns to check for outliers (default=None, which means all numeric)
    - method: 'iqr' (Interquartile Range, 1.5*IQR beyond the quartiles) or
      'zscore' (more than 3 standard deviations from the mean). Any value
      other than 'iqr' selects the z-score method.
    - treatment: 'cap' (cap at bounds), 'remove' (remove rows), or 'none' (just identify)

    Returns:
    - DataFrame with handled outliers
    - Dictionary with outlier information: for each column that has
      outliers, a dict with 'count', 'percent', 'indices' and 'bounds'
      (plus 'remove_indices' when treatment is 'remove')
    """
    if columns is None:
        columns = df.select_dtypes(include=['int64', 'float64']).columns

    if treatment == 'cap':
        # Capping assigns columns, so work on a copy to leave the caller's
        # DataFrame untouched.
        df = df.copy()

    outlier_info = {}
    total_outliers = 0

    print(f"Detecting outliers using {method} method...")

    for col in columns:
        values = df[col]

        if method == 'iqr':
            # IQR method
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            outlier_mask = (values < lower_bound) | (values > upper_bound)

        else:  # zscore method
            # Use one mean / population standard deviation for both the
            # detection and the capping bounds, so every value flagged as an
            # outlier really lies outside the bounds it gets capped to.
            mean = values.mean()
            std = values.std(ddof=0)
            lower_bound = mean - 3 * std
            upper_bound = mean + 3 * std

            if std > 0:
                # Missing values give NaN scores, which compare as False
                outlier_mask = ((values - mean) / std).abs() > 3
            else:
                # Constant or empty column: z-scores are undefined
                outlier_mask = np.zeros(len(values), dtype=bool)

        outlier_indices = values[outlier_mask].index
        n_outliers = len(outlier_indices)
        if n_outliers == 0:
            continue

        # Store outlier information
        percent_outliers = n_outliers / len(df) * 100
        outlier_info[col] = {
            'count': n_outliers,
            'percent': percent_outliers,
            'indices': outlier_indices,
            'bounds': (lower_bound, upper_bound)
        }
        total_outliers += n_outliers

        print(f"{col}: Found {n_outliers} outliers ({percent_outliers:.2f}% of data)")

        # Handle outliers according to specified treatment
        if treatment == 'cap':
            # Cap outliers at the bounds
            df[col] = values.clip(lower=lower_bound, upper=upper_bound)
            print(f"  - Capped outliers at lower bound: {lower_bound:.2f}, upper bound: {upper_bound:.2f}")

        elif treatment == 'remove':
            # Store indices for later removal
            outlier_info[col]['remove_indices'] = outlier_indices

    # If removing outliers, do it after processing all columns so that every
    # column's bounds are computed on the full, unfiltered data
    if treatment == 'remove' and total_outliers > 0:
        # Collect all indices to remove across columns
        all_indices = set()
        for info in outlier_info.values():
            all_indices.update(info['remove_indices'])

        # Remove rows with outliers
        df = df.drop(index=list(all_indices))
        print(f"\nRemoved {len(all_indices)} rows with outliers.")

    if total_outliers == 0:
        print("No outliers detected.")

    return df, outlier_info

# Handle outliers (uncomment to run)
# df, outlier_info = handle_outliers(df, method='iqr', treatment='cap')

# ### 4.3 Feature Transformation and Encoding

def transform_features(df, target_col=None, problem_type=None):
    """
    Apply various transformations to features
    
    Parameters:
    - df: DataFrame with the data
    - target_col: Name of target column (to exclude

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0
