In [1]:
import pandas as pd
import numpy as  np
import matplotlib.pyplot as plt


df  = pd.read_csv("laptopData.csv")

df = pd.DataFrame(df)

df = df.dropna(how = 'all')





if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)




df = df.replace('?', np.nan)

df['Inches'] = pd.to_numeric(df['Inches'], errors='coerce')


df['Weight'] = pd.to_numeric(df['Weight'].str.replace('kg','',regex=False),errors='coerce')



df['Ram'] = pd.to_numeric(df['Ram'].str.replace('GB', '', regex=False), errors='coerce')





median_weight = df['Weight'].median()
df['Weight'] = df['Weight'].fillna(median_weight)




inches_by_type = df.groupby('TypeName')['Inches'].median()

for type_name in df['TypeName'].unique():
    filled = (df['TypeName'] == type_name) & (df['Inches'].isnull())
    df.loc[filled, 'Inches'] = inches_by_type[type_name]




ram_by_group = df.groupby(['TypeName', 'Cpu', 'OpSys'])['Ram'].median()

for typename, cpu, opsys in ram_by_group.index:
    filled = ((df['TypeName'] == typename) & (df['Cpu'] == cpu) & (df['OpSys'] == opsys) & (df['Ram'].isnull()))
    df.loc[filled,'Ram'] = ram_by_group.loc[(typename , cpu , opsys)]

ram_by_cpu = df.groupby('Cpu')['Ram'].median()
df.loc[df['Ram'].isnull(), 'Ram'] = df.loc[df['Ram'].isnull(), 'Cpu'].map(ram_by_cpu)

ram_by_os = df.groupby('OpSys')['Ram'].median()
df.loc[df['Ram'].isnull(), 'Ram'] = df.loc[df['Ram'].isnull(), 'OpSys'].map(ram_by_os)

df['Ram'] = df['Ram'].fillna(df['Ram'].median())

def convert_memory(value):
    if pd.isna(value):
        return np.nan

    value = str(value).strip()


    if "+" in value:
        total = 0
        parts = value.split("+")
        for p in parts:
            p = p.strip()
            total += convert_memory(p)  
        return total


    if "TB" in value:
        num = value.replace("TB", "")
        num = ''.join(c for c in num if c.isdigit() or c == '.')
        return float(num) * 1024


    if "GB" in value:
        num = value.replace("GB", "")
        num = ''.join(c for c in num if c.isdigit() or c == '.')
        return float(num)


    if "Flash Storage" in value:
        num = ''.join([c for c in value if c.isdigit()])
        if num != "":
            return float(num)  # in GB
        else:
            return np.nan

    if "Hybrid" in value:
        num = ''.join([c for c in value if c.isdigit()])
        if num != "":
            return float(num)
        else:
            return np.nan


    return np.nan



df['Memory'] = df['Memory'].apply(convert_memory)


memory_by_group = df.groupby(['Ram', 'Cpu', 'OpSys'])['Memory'].median()

for ram,cpu,opsys in memory_by_group.index:
    filled = (
        (df['Ram']==ram)&
        (df['Cpu'] == cpu) &
        (df['OpSys'] == opsys) &
        (df['Memory'].isnull())
    )

    df.loc[filled, 'Memory'] = memory_by_group.loc[(ram, cpu, opsys)]


df['Memory'] = df['Memory'].fillna(df['Memory'].median())


price_by_group = df.groupby(['Company','TypeName','Cpu','OpSys','ScreenResolution'])['Price'].median()


for company, typename, cpu, opsys, screen in price_by_group.index:
    filled = (
        (df['Company']==company) &
        (df['TypeName']==typename) &
        (df['Cpu']==cpu) &
        (df['OpSys']==opsys) &
        (df['ScreenResolution']==screen) &
        (df['Price'].isnull())
    )
    df.loc[filled,'Price'] = price_by_group.loc[(company,typename,cpu,opsys,screen)]


price_by_tc = df.groupby(['TypeName','Cpu'])['Price'].median()

for typename, cpu in price_by_tc.index:
    filled = (
        (df['TypeName']==typename) &
        (df['Cpu']==cpu) &
        (df['Price'].isnull())
    )
    df.loc[filled,'Price'] = price_by_tc.loc[(typename,cpu)]


price_by_company = df.groupby('Company')['Price'].median()
df.loc[df['Price'].isnull(), 'Price'] = df.loc[df['Price'].isnull(), 'Company'].map(price_by_company)


df['Price'] = df['Price'].fillna(df['Price'].median())

missing_values = df.isnull().sum()

print("Missing Values in Each Column:")
print(missing_values)


print(f"Dataset shape: {df.shape}")

df.to_csv("laptopData_cleaned.csv", index=False)



Missing Values in Each Column:
Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
dtype: int64
Dataset shape: (1273, 11)


In [2]:
from sklearn.model_selection import train_test_split

# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)


X = df.drop(columns = ["Price"])
y = df["Price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# df.info()

print(f"{X_train.shape}")
print(f"X_test : {X_test.shape}")
print(f"y_trin : {y_train.shape}")
print(f"y_test : {y_test.shape}")

# X_train.head()

# y_train.head()

print(X_train.head())
print(y_train.head())




# df.head()

(1018, 10)
X_test : (255, 10)
y_trin : (1018,)
y_test : (255,)
    Company            TypeName  Inches  \
502  Lenovo  2 in 1 Convertible    13.9   
416    Dell            Notebook    13.3   
162    Dell            Notebook    15.6   
667      HP            Notebook    15.6   
792  Lenovo  2 in 1 Convertible    13.3   

                              ScreenResolution                         Cpu  \
502  IPS Panel Full HD / Touchscreen 1920x1080  Intel Core i5 8250U 1.6GHz   
416                                   1366x768    Intel Core i3 6006U 2GHz   
162                          Full HD 1920x1080  Intel Core i5 8250U 1.6GHz   
667                          Full HD 1920x1080  Intel Core i7 7500U 2.7GHz   
792  IPS Panel Full HD / Touchscreen 1920x1080  Intel Core i7 7500U 2.7GHz   

     Ram  Memory                     Gpu       OpSys  Weight  
502  8.0   256.0  Intel UHD Graphics 620  Windows 10    1.40  
416  4.0   128.0   Intel HD Graphics 520  Windows 10    1.65  
162  8.0   256.0    

In [3]:
# non_intel_core = df.loc[~df['Cpu'].str.contains('Intel Core' , case = False , na = False)]
# len(non_intel_core)
# # print(non_intel_core)


In [4]:
# inches_by_type = df.groupby('TypeName')['Inches'].median()

# for type_name in df['TypeName'].unique():
#     filled = (df['TypeName'] == type_name) & (df['Inches'].isnull())
#     df.loc[filled, 'Inches'] = inches_by_type[type_name]

# memory_by_type  = df.groupby(["Company","TypeName","Ram"])


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score




df = pd.read_csv("employe.csv")

# df.head()

if "id" in df.columns:
    df = df.drop("id" ,axis=1)
    


# print(f"df.head \n  : {df.head}")    
# print(df.columns)

X = df.drop(columns=["CustomerId","Surname","Geography","Gender"])       
y = df["Exited"]


#  y is target variable
# x is  feature variable  



# print(f"X.head() \n : {X.head()}")




X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

model = RandomForestClassifier(
    n_estimators= 300,
    max_depth=None,
    min_samples_split= 2,
)


# Model Training


model.fit(X_train,y_train)

pred = model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, pred))
# print("Accuracy X:", accuracy_score(X_test, pred))
# print(f"Accuracy : {accuracy_score(X_test, pred)}")


print(f"X_train : {X_train.shape}")
print(f"X_test  : {X_test.shape}")
print(f"y_train : {y_train.shape}")
print(f"y_test  : {y_test.shape}")

# X_train.head()

# y_train.head()

# print(f"X_train.head() \n : {X_train.head()}")
# print(f"y_train.head() \n : {y_train.head()}")









Accuracy: 1.0
X_train : (132027, 9)
X_test  : (33007, 9)
y_train : (132027,)
y_test  : (33007,)


In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)


                           
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.predict([[0, 0, 0, 0]]))

[1]


Checking Model Accuracy and model metrics
F1
Precision
Recall
Zscore
accuracy - auc


In [11]:


CustomerId = int(input("Enter Customer ID: "))
CreditScore = int(input("Enter Credit Score: "))
Age = int(input("Enter Age: "))
Tenure = int(input("Enter Tenure: "))
Balance = int(input("Enter Balance: "))
NumOfProducts = int(input("Enter Number of Products: "))
HasCrCard = int(input("Enter Has Credit Card: "))
IsActiveMember = int(input("Enter Is Active Member: "))
EstimatedSalary = int(input("Enter Estimated Salary: "))

employee2d = [[CustomerId, CreditScore, Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary]]

predictionn = model.predict(employee2d)

print("predictionn",predictionn)





predictionn [1]


