# CSV Datatype Analysis

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the dataset from disk and report its dimensions
df = pd.read_csv('Mobile.csv')
row_count, column_count = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

In [None]:
# Show the dtype pandas inferred for every column, under a heading
print("Column Datatypes:", df.dtypes, sep="\n")

In [None]:
# Print a concise summary of the DataFrame via DataFrame.info():
# per-column datatypes and non-null counts, plus memory usage
df.info()

In [None]:
# Count missing values in each column
df.isna().sum()

In [None]:
# Tally how many columns share each dtype
dtype_counts = df.dtypes.value_counts()
print("\nDatatype Summary:")
print(dtype_counts)

In [None]:
# Convert unit-suffixed string columns to numeric dtypes by stripping the
# unit suffix and casting.
# Each entry maps a column name to (unit suffix to strip, target dtype);
# integer conversions come first, followed by the float conversion.
unit_conversions = {
    'Battery_power_mAh': (' mAh', int),
    'Ram_mb': (' mb', int),
    'Internal_memeory_gb': (' gb', int),
    'Primary_camera': (' pixels', int),
    'Front_camera': (' pixels', int),
    'Mobile_weight': (' g', int),
    'px_height': (' ppcm', int),
    'Pixel_width': (' ppcm', int),
    'Screen_height': (' cm', int),
    'Mobile_depth': (' cm', float),
}

for col, (suffix, target_dtype) in unit_conversions.items():
    values = df[col]
    # Strip the suffix only while the column still holds strings; a column
    # that is already numeric (e.g. when this cell is re-run) has no `.str`
    # accessor and would otherwise raise AttributeError.
    if not pd.api.types.is_numeric_dtype(values):
        # regex=False: treat the suffix as literal text regardless of the
        # installed pandas version's default for `regex`.
        values = values.str.replace(suffix, '', regex=False)
    df[col] = values.astype(target_dtype)

print("Data type conversions completed!")
print("\nUpdated datatypes:")
print(df.dtypes)

In [None]:
# Convert binary categorical features from Yes/No to 1/0
binary_columns = ['Bluetooh', 'Dual_sim', '4G', '3G', 'touch_screen', 'wifi']
yes_no_to_int = {'Yes': 1, 'No': 0}

for col in binary_columns:
    if col not in df.columns:  # Column is absent from this dataset; skip it
        continue
    # A column that is already numeric has been converted before. Mapping its
    # 1/0 values through yes_no_to_int would replace every value with NaN,
    # so leave it untouched to keep the cell safe to re-run.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    df[col] = df[col].map(yes_no_to_int)

print("Binary categorical conversions completed!")
print("\nConverted columns:")
for col in binary_columns:
    if col in df.columns:
        print(f"{col}: {df[col].unique()}")

In [None]:
# Remove the Screen_height, Screen_weight, and 3G columns
columns_to_remove = ['Screen_height', 'Screen_weight', '3G']

# Drop only the columns that are still present so that re-running this cell
# does not raise KeyError; report anything that was skipped so a misspelled
# column name is still visible.
removed_columns = [col for col in columns_to_remove if col in df.columns]
skipped_columns = [col for col in columns_to_remove if col not in df.columns]
df = df.drop(columns=removed_columns)

print(f"Columns removed: {removed_columns}")
if skipped_columns:
    print(f"Columns not found (skipped): {skipped_columns}")
print(f"New dataset shape: {df.shape}")
print(f"Remaining columns: {df.shape[1]}")

In [None]:
# Final Preprocessed Dataset Structure
# Prints a report of the current DataFrame: its dimensions, the features
# grouped as continuous / binary / categorical, and a basic data-quality
# check (missing values and duplicate rows).
banner = "=" * 80
rule = "-" * 80

print(banner)
print("FINAL PREPROCESSED DATASET STRUCTURE")
print(banner)
print()

# Dataset dimensions
record_count, feature_count = df.shape
print("Dataset Dimensions:")
print(f"  Total Records: {record_count}")
print(f"  Total Features: {feature_count}")
print()

# Column information by category
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Keep only the binary features that exist in the DataFrame so the reported
# count and the numbering below always agree with what is actually listed.
binary_cols = [
    col
    for col in ['Bluetooh', 'Dual_sim', '4G', 'touch_screen', 'wifi']
    if col in df.columns
]
continuous_cols = [col for col in numerical_cols if col not in binary_cols]

print("Feature Categories:")
print(f"  Continuous Numerical: {len(continuous_cols)}")
print(f"  Binary (0/1): {len(binary_cols)}")
print(f"  Categorical (String): {len(categorical_cols)}")
print()

print(rule)
print(f"Continuous Numerical Features ({len(continuous_cols)}):")
print(rule)
for i, col in enumerate(continuous_cols, 1):
    dtype = df[col].dtype
    min_val = df[col].min()
    max_val = df[col].max()
    print(f"{i:2d}. {col:25s} | Type: {str(dtype):7s} | Range: [{min_val:8.1f}, {max_val:8.1f}]")

print()
print(rule)
print(f"Binary Features ({len(binary_cols)}):")
print(rule)
for i, col in enumerate(binary_cols, 1):
    dtype = df[col].dtype
    unique_vals = sorted(df[col].unique())
    print(f"{i}. {col:25s} | Type: {str(dtype):7s} | Values: {unique_vals}")

print()
print(rule)
print(f"Categorical Features ({len(categorical_cols)}):")
print(rule)
if categorical_cols:
    for i, col in enumerate(categorical_cols, 1):
        dtype = df[col].dtype
        unique_count = df[col].nunique()
        print(f"{i}. {col:25s} | Type: {str(dtype):7s} | Unique values: {unique_count}")
        print(f"   Categories: {df[col].unique().tolist()}")
else:
    print("None")

print()
print(banner)
print("Data Quality Check:")
print(f"  Missing values: {df.isnull().sum().sum()}")
print(f"  Duplicate rows: {df.duplicated().sum()}")
print(banner)
print()
# "\!" is not a valid escape sequence: it printed a literal backslash and
# triggers a SyntaxWarning on recent Python versions.
print("Preprocessing completed successfully!")
print("Dataset is ready for analysis and modeling.")