<a href="https://colab.research.google.com/github/Vineet3693/mext-exam-_-projects/blob/main/Laptop_Product_Lineup_Strategy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Read the dataset from disk.
# NOTE: the CSV must first be downloaded from the provided Google Drive link
# and placed at the path below.
df = pd.read_csv('/content/synthetic_wtp_laptop_data.csv')

# Quick structural overview of the frame. Each entry pairs a heading with a
# callable producing the object printed underneath it; the callables keep the
# evaluation lazy so every value is computed right before it is printed.
print("Dataset shape:", df.shape)
overview = (
    ("\nColumn names:", lambda: df.columns.tolist()),
    ("\nFirst few rows:", lambda: df.head()),
    ("\nData types:", lambda: df.dtypes),
    ("\nBasic statistics:", lambda: df.describe()),
    # Per-column count of missing values
    ("\nMissing values:", lambda: df.isnull().sum()),
    # A slightly longer sample to eyeball the actual column names
    ("\nSample of data:", lambda: df.head(10)),
)
for heading, build in overview:
    print(heading)
    print(build())

# Based on the actual column names, let's map them properly
# This is a flexible approach that will work regardless of exact column names

def find_column_names(df):
    """
    Guess which columns of ``df`` hold the laptop attributes and the price.

    Column labels are compared case-insensitively against one keyword list
    per role; the first column (in ``df.columns`` order) whose label contains
    any of the role's keywords is selected. Roles are resolved in the order
    memory, storage, CPU, screen, price. A column already assigned to one
    role is never reused for a later role, so a label such as ``ram_size``
    cannot end up as both the memory column and the screen column.

    Args:
        df: DataFrame whose ``columns`` are inspected. The data itself is
            not read.

    Returns:
        Tuple ``(memory_col, storage_col, cpu_col, screen_col, price_col)``
        holding the original column labels. An entry is ``None`` when no
        column matched that role.
    """
    # One keyword tuple per role, in the order the roles are returned.
    role_keywords = (
        ('memory', 'ram'),
        ('storage', 'disk', 'ssd', 'hdd'),
        ('cpu', 'processor', 'class'),
        ('screen', 'display', 'size', 'inch'),
        ('price', 'cost', 'yen', 'amount'),
    )

    labels = list(df.columns)
    # Lower-case every label once; str() keeps non-string labels from raising.
    lowered = [str(label).lower() for label in labels]

    # Track claimed columns by position so duplicate labels are handled too.
    taken = set()
    picks = []
    for keywords in role_keywords:
        hit = next(
            (
                position
                for position, name in enumerate(lowered)
                if position not in taken
                and any(word in name for word in keywords)
            ),
            None,
        )
        if hit is None:
            picks.append(None)
        else:
            taken.add(hit)
            picks.append(labels[hit])

    return tuple(picks)

# Resolve the dataset's real column label for every role
detected = find_column_names(df)
memory_col, storage_col, cpu_col, screen_col, price_col = detected

print("\nIdentified columns:")
for role, label in zip(("Memory", "Storage", "CPU", "Screen", "Price"), detected):
    print(f"{role} column: {label}")

# When at least one role is still unresolved, list every column with its
# position so the mapping can be filled in by hand
if any(not label for label in detected):
    print("\nCould not automatically detect all columns. Here are all column names:")
    for position, column_label in enumerate(df.columns):
        print(f"{position}: {column_label}")

    # Manual override: uncomment the lines below and replace the placeholder
    # with the position printed above for the matching column.
    # memory_col = df.columns[X]  # Replace X with the correct index
    # storage_col = df.columns[Y]  # Replace Y with the correct index
    # cpu_col = df.columns[Z]  # Replace Z with the correct index
    # screen_col = df.columns[A]  # Replace A with the correct index
    # price_col = df.columns[B]  # Replace B with the correct index

# Create a standardized dataframe with consistent column names
def create_standardized_df(df, memory_col, storage_col, cpu_col, screen_col, price_col):
    """
    Build a copy of ``df`` restricted to the standard analysis columns.

    Each supplied source column is renamed to its standard name
    (``memory_gb``, ``storage_gb``, ``cpu_class``, ``screen_size``,
    ``price``); a falsy source (e.g. ``None``) is skipped. The result keeps
    only those standard columns that exist after renaming, in that fixed
    order. The input frame is not modified.
    """
    # (source label, standard name) pairs, listed in output column order
    targets = (
        (memory_col, 'memory_gb'),
        (storage_col, 'storage_gb'),
        (cpu_col, 'cpu_class'),
        (screen_col, 'screen_size'),
        (price_col, 'price'),
    )

    rename_map = {source: standard for source, standard in targets if source}
    renamed = df.copy().rename(columns=rename_map)

    # Drop everything that is not one of the standard columns
    keep = [standard for _, standard in targets if standard in renamed.columns]
    return renamed[keep]

# Map the detected columns onto the standard names and show the result
df_std = create_standardized_df(
    df,
    memory_col=memory_col,
    storage_col=storage_col,
    cpu_col=cpu_col,
    screen_col=screen_col,
    price_col=price_col,
)

print(
    f"\nStandardized dataset shape: {df_std.shape}",
    f"Standardized columns: {df_std.columns.tolist()}",
    "\nStandardized data sample:",
    df_std.head(),
    sep="\n",
)

# Clean and prepare the data
def prepare_data(df, *, iqr_multiplier=1.5):
    """
    Return a cleaned copy of ``df`` without missing values and price outliers.

    Steps, in order:
      1. Drop every row that has a missing value in any column.
      2. Convert the standard columns that are present (``memory_gb``,
         ``storage_gb``, ``cpu_class``, ``screen_size``, ``price``) to
         numeric; values that cannot be parsed become NaN.
      3. Drop the rows that became NaN in step 2.
      4. If a ``price`` column exists and more than 10 rows remain, keep only
         rows whose price lies within
         ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]``
         (bounds inclusive).

    Args:
        df: Input DataFrame. It is never modified.
        iqr_multiplier: Width of the outlier fence in IQR units. The default
            of 1.5 reproduces the previous hard-coded behaviour.

    Returns:
        A new DataFrame containing the surviving rows; the original index
        labels are kept (the index is not reset).
    """
    numeric_columns = ['memory_gb', 'storage_gb', 'cpu_class', 'screen_size', 'price']

    # Explicit copy: the column assignments below must not be made on an
    # object pandas may treat as a view of the caller's frame (which would
    # emit SettingWithCopyWarning).
    df_clean = df.dropna().copy()

    for col in numeric_columns:
        if col in df_clean.columns:
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

    # Remove any rows that couldn't be converted to numeric
    df_clean = df_clean.dropna()

    # Outlier removal is skipped for very small samples, where the quartiles
    # are too unstable to define a meaningful fence.
    if 'price' in df_clean.columns and len(df_clean) > 10:
        q1 = df_clean['price'].quantile(0.25)
        q3 = df_clean['price'].quantile(0.75)
        fence = iqr_multiplier * (q3 - q1)
        lower_bound = q1 - fence
        upper_bound = q3 + fence
        within_fence = (df_clean['price'] >= lower_bound) & (df_clean['price'] <= upper_bound)
        # Copy again so callers can freely assign into the returned frame.
        df_clean = df_clean[within_fence].copy()

    return df_clean

# Run the cleaning pass and report the size of what survived it
df_clean = prepare_data(df=df_std)
cleaned_shape = df_clean.shape
print(f"\nCleaned dataset shape: {cleaned_shape}")

# Check if we have enough data to proceed
# NOTE: the analysis that follows sits inside the else branch, so it is
# skipped entirely when fewer than 10 rows survive cleaning.
if len(df_clean) < 10:
    print("ERROR: Not enough clean data to build a reliable model.")
    print("Please check the dataset and column mappings.")
else:
    print("Dataset is ready for analysis!")

    # Current model specifications
    # Keys use the standardized feature column names, so the dict can be
    # looked up by feature name when a prediction row is assembled.
    current_specs = {
        'memory_gb': 16,
        'storage_gb': 512,
        'cpu_class': 1,
        'screen_size': 14.0
    }

    # Upgrade options with their costs
    # Per option: 'cost' is the additional cost of the upgrade (printed with
    # a "yen" suffix later in this cell); 'change' holds the spec values that
    # overwrite the matching current_specs entries for that option.
    upgrade_options = {
        'memory': {'cost': 7000, 'change': {'memory_gb': 32}},  # Add 16GB (16+16=32)
        'storage': {'cost': 5000, 'change': {'storage_gb': 1024}},  # Add 512GB (512+512=1024)
        'cpu': {'cost': 15000, 'change': {'cpu_class': 2}},  # Upgrade by 1 level
        'screen': {'cost': 3000, 'change': {'screen_size': 16.0}}  # Increase to 16 inches
    }

    # Build a regression model to predict laptop prices
    def build_price_model(df_clean):
        """
        Fit a linear regression of ``price`` on the available spec columns.

        The candidate features are ``memory_gb``, ``storage_gb``,
        ``cpu_class`` and ``screen_size``; only those present in
        ``df_clean`` are used. Features are standardized with a
        ``StandardScaler`` before fitting. The R² on the training data and
        the per-feature coefficients (on the standardized scale) are printed.

        Returns:
            ``(model, scaler, feature_columns)`` - the fitted regression, the
            fitted scaler and the list of feature names in training order.

        Raises:
            ValueError: if fewer than two candidate features are present.
        """
        candidates = ('memory_gb', 'storage_gb', 'cpu_class', 'screen_size')
        feature_columns = [name for name in candidates if name in df_clean.columns]

        if not len(feature_columns) >= 2:
            raise ValueError("Not enough feature columns available for modeling")

        inputs = df_clean[feature_columns]
        targets = df_clean['price']

        # Standardize the inputs, then fit the regression on the scaled values
        scaler = StandardScaler()
        scaled_inputs = scaler.fit_transform(inputs)
        model = LinearRegression().fit(scaled_inputs, targets)

        # Goodness of fit, measured on the same rows used for training
        fit_quality = r2_score(targets, model.predict(scaled_inputs))
        print(f"\nModel R² score: {fit_quality:.3f}")

        # One coefficient per feature, in training order
        print("\nFeature coefficients:")
        for name, weight in zip(feature_columns, model.coef_):
            print(f"  {name}: {weight:.2f}")

        return model, scaler, feature_columns

    # The try body covers model fitting and the analysis steps below it; any
    # failure is reported by the except clause that closes this statement.
    try:
        model, scaler, feature_columns = build_price_model(df_clean)

        # Predict price function
        def predict_price(specs, model, scaler, feature_columns):
            """
            Predict the price for one laptop configuration.

            Args:
                specs: Mapping from feature name to value. A feature missing
                    from the mapping is filled with 0.
                model: Fitted regressor exposing ``predict``.
                scaler: Fitted scaler exposing ``transform``.
                feature_columns: Feature names in the order used for fitting.

            Returns:
                The first (and only) element of ``model.predict`` for the
                scaled single-row input.
            """
            # Build a one-row DataFrame carrying the training column labels.
            # The scaler is fitted on a DataFrame, and scikit-learn warns
            # about mismatched feature names when such an estimator is later
            # handed a bare ndarray.
            row = pd.DataFrame(
                [[specs.get(col, 0) for col in feature_columns]],
                columns=list(feature_columns),
            )
            features_scaled = scaler.transform(row)
            return model.predict(features_scaled)[0]

        # Calculate current model predicted price
        current_price_predicted = predict_price(current_specs, model, scaler, feature_columns)
        print(f"\nCurrent model predicted market price: {current_price_predicted:,.0f} yen")
        # 111000 is a hard-coded literal; it is only printed here and does
        # not enter the profit calculation below.
        print(f"Current model selling price: {111000:,} yen")

        # Calculate gross profit for each upgrade option
        # Each option is evaluated on its own against the baseline specs;
        # upgrades are never combined.
        results = {}

        for option_name, option_data in upgrade_options.items():
            # Create upgraded specs
            # (copy first so current_specs itself is left unchanged)
            upgraded_specs = current_specs.copy()
            upgraded_specs.update(option_data['change'])

            # Predict market price for upgraded model
            upgraded_price_predicted = predict_price(upgraded_specs, model, scaler, feature_columns)

            # Calculate price increase due to upgrade
            # (difference of two model predictions, not of observed prices)
            price_increase = upgraded_price_predicted - current_price_predicted

            # Calculate gross profit = price increase - additional cost
            gross_profit = price_increase - option_data['cost']

            results[option_name] = {
                'upgraded_specs': upgraded_specs,
                'predicted_price': upgraded_price_predicted,
                'price_increase': price_increase,
                'additional_cost': option_data['cost'],
                'gross_profit': gross_profit
            }

            print(f"\n{option_name.upper()} UPGRADE:")
            print(f"  Upgraded specs: {option_data['change']}")
            print(f"  Predicted market price: {upgraded_price_predicted:,.0f} yen")
            print(f"  Price increase: {price_increase:,.0f} yen")
            print(f"  Additional cost: {option_data['cost']:,} yen")
            print(f"  Gross profit: {gross_profit:,.0f} yen")

        # Rank options by gross profit
        # Highest gross profit first; each item is an (option_name, data) pair.
        ranked_options = sorted(results.items(), key=lambda x: x[1]['gross_profit'], reverse=True)

        print("\n" + "="*50)
        print("RANKING BY GROSS PROFIT:")
        print("="*50)

        for i, (option_name, data) in enumerate(ranked_options, 1):
            print(f"{i}. {option_name.upper()}: {data['gross_profit']:,.0f} yen")

        print("\n" + "="*50)
        print("TOP TWO UPGRADE OPTIONS:")
        print("="*50)

        # The slice tolerates fewer than two options, but the "FINAL ANSWER"
        # prints below index top_two[0] and top_two[1] directly.
        top_two = ranked_options[:2]
        for i, (option_name, data) in enumerate(top_two, 1):
            print(f"{i}. {option_name.upper()}")
            print(f"   Gross profit: {data['gross_profit']:,.0f} yen")
            print(f"   Price increase: {data['price_increase']:,.0f} yen")
            print(f"   Additional cost: {data['additional_cost']:,} yen")

        # Final answer
        print(f"\nFINAL ANSWER: The two upgrade options with highest gross profit are:")
        print(f"1. {top_two[0][0].upper()}")
        print(f"2. {top_two[1][0].upper()}")

    # Broad catch: an exception raised anywhere in the try body (model
    # fitting, prediction, ranking or printing) is reported with this single
    # message instead of a traceback.
    except Exception as e:
        print(f"Error in model building: {e}")
        print("Please check the dataset structure and column names.")

Dataset shape: (1000, 6)

Column names:
['Memory', 'Storage', 'CPU_class', 'Screen_size', 'year', 'price']

First few rows:
   Memory  Storage  CPU_class  Screen_size  year   price
0       8     1024          3         16.0  2025  162021
1      32     1024          0         16.0  2024  112214
2      16      512          1         16.0  2025  118354
3      16     1024          1         16.0  2025  123383
4      16      256          3         16.0  2024  162968

Data types:
Memory           int64
Storage          int64
CPU_class        int64
Screen_size    float64
year             int64
price            int64
dtype: object

Basic statistics:
            Memory      Storage    CPU_class  Screen_size         year  \
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000   
mean     18.704000   594.688000     1.502000    15.203200  2024.524000   
std      10.009024   319.767594     1.123948     0.864278     0.499674   
min       8.000000   256.000000     0.000000    14.000

