In [11]:

# Phone Price EDA

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [12]:
# Load data
df = pd.read_csv("../data/my_phone.csv")
print(f"Dataset: {df.shape[0]} rows, {df.shape[1]} columns")


Dataset: 4020 rows, 12 columns


In [13]:
# Clean data
df = df[df['price_range'].notnull()].copy()
df = df.drop("Unnamed: 0", axis=1)
df['bluetooth'] = df['bluetooth'].replace({'NO': 0, 'YES': 1})
df['dual_sim'] = df['dual_sim'].replace({'NO': 0, 'YES': 1})
df['price_range'] = df['price_range'].round().astype(int)

print(f"After cleaning: {df.shape}")

After cleaning: (3020, 11)



Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [14]:
# 1. Price distribution
fig = px.bar(df['price_range'].value_counts().sort_index(),
             title='Price Range Distribution',
             labels={'index': 'Price Range', 'value': 'Count'})
fig.show()

print("\nPrice counts:")


Price counts:


In [15]:
# 2. Basic stats
print("Basic stats:")
print(df.describe())

Basic stats:
       battery_power    bluetooth       weight       memory      n_cores  \
count    3020.000000  3020.000000  3020.000000  3020.000000  3020.000000   
mean     1087.287086     0.327815   139.398675    25.279801     3.843377   
std       415.928097     0.469495    33.282882    17.622524     2.184206   
min       501.000000     0.000000    80.000000     2.000000     1.000000   
25%       775.000000     0.000000   111.000000    11.000000     2.000000   
50%       843.000000     0.000000   139.000000    17.000000     4.000000   
75%      1429.000000     1.000000   167.000000    40.000000     5.000000   
max      1998.000000     1.000000   200.000000    64.000000     8.000000   

               ram         wifi  pixel_height  pixel_width  price_range  
count  3020.000000  3020.000000   3020.000000  3020.000000  3020.000000  
mean   1575.831457     0.500331    687.983444  1075.495695     1.331126  
std    1181.716841     0.500083    393.410172   458.947181     0.940227  
min   

In [16]:
# 3. Histograms
features = ['battery_power', 'ram', 'memory', 'weight', 'pixel_height', 'pixel_width']

for feature in features:
    fig = px.histogram(df, x=feature, title=f'{feature} distribution')
    fig.show()

In [17]:
# 4. Boxplots by price
for feature in features:
    fig = px.box(df, x='price_range', y=feature, title=f'{feature} by price')
    fig.show()

In [18]:
# 5. Correlation heatmap
corr = df.corr()
fig = px.imshow(corr, text_auto='.2f', title='Correlation Matrix')
fig.show()

print("\nCorrelation with price:")
print(corr['price_range'].sort_values(ascending=False))



Correlation with price:
price_range      1.000000
ram              0.826434
battery_power    0.295102
pixel_width      0.258094
memory           0.171308
bluetooth        0.142709
n_cores          0.113166
pixel_height     0.093849
wifi             0.019492
dual_sim         0.017631
weight          -0.016382
Name: price_range, dtype: float64


In [19]:
# 6. RAM by price (most important)
fig = px.histogram(df, x='ram', color='price_range',
                   title='RAM by Price Range',
                   barmode='overlay', opacity=0.7)
fig.show()


In [20]:
# 7. RAM vs Battery scatter
fig = px.scatter(df, x='ram', y='battery_power', color='price_range',
                title='RAM vs Battery Power')
fig.show()