# Outlier Detection Using IQR Method

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults for the notebook: seaborn "whitegrid" theme, 15x10 figures
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (15, 10)})

In [None]:
# The cleaned data was meant to come from datatype_analysis.ipynb (run up to
# cell 9). For now the raw CSV is loaded and preprocessed directly here.
df = pd.read_csv('Mobile.csv')

# Columns stored as "<number><unit>" strings:
# (column name, unit suffix to strip, numeric type to cast to)
unit_columns = [
    ('Battery_power_mAh', ' mAh', int),
    ('Ram_mb', ' mb', int),
    ('Internal_memeory_gb', ' gb', int),
    ('Primary_camera', ' pixels', int),
    ('Front_camera', ' pixels', int),
    ('Mobile_weight', ' g', int),
    ('px_height', ' ppcm', int),
    ('Pixel_width', ' ppcm', int),
    ('Screen_height', ' cm', int),
    ('Mobile_depth', ' cm', float),
]
for column_name, unit_suffix, target_dtype in unit_columns:
    stripped = df[column_name].str.replace(unit_suffix, '')
    df[column_name] = stripped.astype(target_dtype)

# Yes/No flags become 1/0; flags that are absent from the file are skipped
yes_no_to_int = {'Yes': 1, 'No': 0}
binary_columns = ['Bluetooh', 'Dual_sim', '4G', '3G', 'touch_screen', 'wifi']
for flag_column in binary_columns:
    if flag_column not in df.columns:
        continue
    df[flag_column] = df[flag_column].map(yes_no_to_int)

# Drop the columns that are not used in this analysis
columns_to_remove = ['Screen_height', 'Screen_weight', '3G']
df = df.drop(columns_to_remove, axis=1)

dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
print("Data preprocessing completed!")

In [None]:
# Function to detect outliers using IQR method
def detect_outliers_iqr(data, column):
    """
    Detect outliers in one column using the Interquartile Range (IQR) method.

    A row counts as an outlier when its value is below Q1 - 1.5 * IQR, above
    Q3 + 1.5 * IQR, or negative. Negative values are treated as invalid for
    mobile specs, so they are flagged even when they fall inside the bounds.

    Parameters:
    - data: DataFrame
    - column: Column name to check for outliers

    Returns:
    - Dictionary with the keys 'column', 'Q1', 'Q3', 'IQR', 'lower_bound',
      'upper_bound', 'num_outliers', 'outlier_percentage' (share of all rows
      in `data`, 0.0 when `data` has no rows), 'outlier_indices',
      'num_negative' and 'negative_indices'
    """
    values = data[column]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Build each mask once; negatives are tracked separately for reporting
    is_negative = values < 0
    is_outlier = (values < lower_bound) | (values > upper_bound) | is_negative

    outlier_indices = values[is_outlier].index.tolist()
    negative_indices = values[is_negative].index.tolist()

    # Guard the percentage so an empty DataFrame does not divide by zero
    total_rows = len(data)
    if total_rows:
        outlier_percentage = (len(outlier_indices) / total_rows) * 100
    else:
        outlier_percentage = 0.0

    return {
        'column': column,
        'Q1': Q1,
        'Q3': Q3,
        'IQR': IQR,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'num_outliers': len(outlier_indices),
        'outlier_percentage': outlier_percentage,
        'outlier_indices': outlier_indices,
        'num_negative': len(negative_indices),
        'negative_indices': negative_indices
    }

print("Outlier detection function created!")

In [None]:
# Every column with a numeric dtype is a candidate for outlier detection
all_numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()

# Binary flag columns are left out of outlier detection
binary_columns = ['Bluetooh', 'Dual_sim', '4G', 'touch_screen', 'wifi']

# Keep the numeric columns that are not binary flags, in their original order
numerical_columns = []
for candidate in all_numerical_columns:
    if candidate in binary_columns:
        continue
    numerical_columns.append(candidate)

# Report only the flags that are actually present among the numeric columns
excluded_flags = [flag for flag in binary_columns if flag in all_numerical_columns]
print("Binary columns (excluded from outlier detection):")
for flag in excluded_flags:
    print(f"  - {flag}")

print("\nNumerical columns for outlier detection:")
for position, name in enumerate(numerical_columns, start=1):
    print(f"{position}. {name}")

column_total = len(numerical_columns)
print(f"\nTotal numerical columns for analysis: {column_total}")

In [None]:
# Run the IQR detector on every selected column and print its statistics
outlier_results = {}

banner = "=" * 80
print(banner)
print("OUTLIER DETECTION USING IQR METHOD")
print(banner)

# (label shown in the report, key in the detector's result dictionary)
report_fields = (
    ("Q1 (25th percentile)", 'Q1'),
    ("Q3 (75th percentile)", 'Q3'),
    ("IQR", 'IQR'),
    ("Lower Bound", 'lower_bound'),
    ("Upper Bound", 'upper_bound'),
)

for feature_name in numerical_columns:
    stats = detect_outliers_iqr(df, feature_name)
    outlier_results[feature_name] = stats

    print(f"\n{feature_name}:")
    for label, key in report_fields:
        print(f"  {label}: {stats[key]:.2f}")
    print(f"  Number of Outliers: {stats['num_outliers']}")
    print(f"  Outlier Percentage: {stats['outlier_percentage']:.2f}%")

    # Negative values are already part of the outlier count; call them out
    negative_count = stats['num_negative']
    if negative_count > 0:
        print(f"  WARNING: {negative_count} negative values detected (included in outlier count)")

In [None]:
# Build one summary row per analyzed feature from the detector results
per_feature_results = list(outlier_results.values())

# Summary column header -> key in each result dictionary
summary_columns = {
    'Feature': 'column',
    'Q1': 'Q1',
    'Q3': 'Q3',
    'IQR': 'IQR',
    'Lower Bound': 'lower_bound',
    'Upper Bound': 'upper_bound',
    'Outlier Count': 'num_outliers',
}
summary_data = {
    header: [entry[key] for entry in per_feature_results]
    for header, key in summary_columns.items()
}
# The percentage is rounded to two decimals for display
summary_data['Outlier %'] = [
    round(entry['outlier_percentage'], 2) for entry in per_feature_results
]
outlier_summary = pd.DataFrame(summary_data)

# Features with the most outliers come first
outlier_summary = outlier_summary.sort_values('Outlier Count', ascending=False)
outlier_summary = outlier_summary.reset_index(drop=True)

divider = "=" * 80
print(divider)
print("OUTLIER SUMMARY TABLE")
print(divider)
print(outlier_summary.to_string(index=False))

# Call out the features where more than 5% of the rows are outliers
print("\n" + divider)
print("FEATURES WITH SIGNIFICANT OUTLIERS (>5%)")
print(divider)
above_threshold = outlier_summary['Outlier %'] > 5.0
significant_outliers = outlier_summary[above_threshold]
if significant_outliers.empty:
    print("No features have more than 5% outliers.")
else:
    shown_columns = ['Feature', 'Outlier Count', 'Outlier %']
    print(significant_outliers[shown_columns].to_string(index=False))

In [None]:
# Analyze specific outliers to determine if they are legitimate or errors
def analyze_outliers(df, column, outlier_indices):
    """
    Print a report on the outlier rows of one column and return their values.

    The report compares the outliers' min/max/mean with the whole column,
    lists every outlier record next to its 'price_range', and shows how the
    outliers are distributed across price ranges, so a reader can judge
    whether they are data errors or legitimate extreme values.

    Parameters:
    - df: DataFrame containing `column` and a 'price_range' column
    - column: Column name whose outliers are analyzed
    - outlier_indices: Index labels of the outlier rows (used with df.loc)

    Returns:
    - NumPy array with the outlier values of `column` (empty when
      `outlier_indices` is empty)
    """
    print(f"\nAnalyzing outliers in: {column}")
    print("-" * 60)

    outlier_values = df.loc[outlier_indices, column].values

    # Statistics
    print(f"Total outliers: {len(outlier_values)}")

    # min()/max() on an empty array raise ValueError, so stop here when there
    # is nothing to analyze
    if len(outlier_values) == 0:
        print("No outliers to analyze.")
        return outlier_values

    print(f"Min outlier value: {outlier_values.min()}")
    print(f"Max outlier value: {outlier_values.max()}")
    print(f"Mean of outliers: {outlier_values.mean():.2f}")
    print(f"Overall column min: {df[column].min()}")
    print(f"Overall column max: {df[column].max()}")
    print(f"Overall column mean: {df[column].mean():.2f}")

    # Show ALL outlier records
    print("\nAll outlier records:")
    print(df.loc[outlier_indices, [column, 'price_range']])

    # Show price range distribution
    print("\nPrice range distribution of outliers:")
    print(df.loc[outlier_indices, 'price_range'].value_counts())

    return outlier_values

# Detailed report for the features with the most outliers
rule = "=" * 80
print(rule)
print("DETAILED OUTLIER ANALYSIS")
print(rule)

# Take the first 3 rows of the summary table as the top 3 features
top_outlier_features = outlier_summary['Feature'].head(3).tolist()

for feature in top_outlier_features:
    feature_result = outlier_results[feature]
    # Skip features that have no outliers to analyze
    if feature_result['num_outliers'] <= 0:
        continue
    analyze_outliers(df, feature, feature_result['outlier_indices'])

In [None]:
# Create box plots to visualize outliers
# Size the grid from the number of columns: a fixed 4x3 grid would silently
# leave out every column after the 12th.
n_plot_cols = 3
n_plot_rows = max(1, -(-len(numerical_columns) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(n_plot_rows, n_plot_cols,
                         figsize=(15, 4 * n_plot_rows), squeeze=False)

# squeeze=False always returns a 2-D array, so flatten() is safe for any grid
axes = axes.flatten()

for idx, col in enumerate(numerical_columns):
    # Create box plot (vertical is the default orientation).
    # Missing values are dropped because a NaN would leave the box empty.
    axes[idx].boxplot(df[col].dropna().to_numpy(), patch_artist=True,
                     boxprops=dict(facecolor='lightblue', alpha=0.7),
                     medianprops=dict(color='red', linewidth=2),
                     whiskerprops=dict(color='blue', linewidth=1.5),
                     capprops=dict(color='blue', linewidth=1.5),
                     flierprops=dict(marker='o', markerfacecolor='red', markersize=6, alpha=0.5))

    axes[idx].set_title(f'{col}\n({outlier_results[col]["num_outliers"]} outliers)',
                       fontsize=10, fontweight='bold')
    axes[idx].set_ylabel('Value')
    axes[idx].grid(True, alpha=0.3)

# Hide any unused subplots
for unused_ax in axes[len(numerical_columns):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

print("\nBox plots created successfully!")

## Outlier Analysis Decision Summary

Based on the IQR outlier detection analysis:

### Decision: **RETAIN ALL OUTLIERS**

**Rationale:**
1. **Legitimate Extreme Values**: The outliers represent valid mobile phone specifications across different price ranges
   - Premium devices naturally have exceptional features (high RAM, storage, camera quality)
   - Budget devices have lower specifications
   - These variations are expected in the mobile phone market

2. **Natural Variation**: The outliers preserve the natural distribution of mobile phone specifications
   - High-end flagship phones with 1TB storage, 12GB+ RAM
   - Gaming phones with exceptional battery capacity
   - Ultra-budget phones with minimal specifications

3. **No Data Errors Detected**: 
   - All outlier values are within realistic ranges for mobile phones
   - No impossible values (e.g., negative battery capacity, 0g weight)
   - Values align with their respective price ranges

4. **Preserving Information**: Removing outliers would:
   - Eliminate important information about premium/budget segments
   - Reduce model's ability to predict across all price ranges
   - Create artificial boundaries that don't reflect real market conditions

### Conclusion
All outliers are retained: they are valid extreme values, and keeping them ensures the dataset represents the full spectrum of mobile phone specifications across all price ranges.