# Task: Descriptive Analysis

In [1]:
import pandas as pd
# Load the dataset into the notebook-wide frame `df`.
# Note: the filename really does contain a space before ".csv".
dataset_path = 'Dataset .csv'
df = pd.read_csv(dataset_path)

In [2]:
# Count the null entries in every column of the freshly loaded frame.
null_mask = df.isnull()
missing_values = null_mask.sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64


In [5]:
# Handle missing values: fill the empty 'Cuisines' entries with the column's
# most frequent value (its mode).
#
# The filled column is assigned back to df explicitly. Calling
# fillna(..., inplace=True) on df['Cuisines'] acts on an intermediate object:
# recent pandas versions warn about this chained in-place pattern, and with
# Copy-on-Write enabled it leaves df unchanged.
most_common_cuisine = df['Cuisines'].mode()[0]
df['Cuisines'] = df['Cuisines'].fillna(most_common_cuisine)

# Re-count nulls per column to confirm nothing is missing any more.
missing_values = df.isnull().sum()
print(missing_values)

Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                0
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64


In [7]:
# Convert categorical columns to 'category' type
categorical_columns = [ "Restaurant Name", "City", "Address", "Locality", "Locality Verbose","Cuisines", "Currency", "Rating color", "Rating text" ]
df[categorical_columns] = df[categorical_columns].astype('category')

# Convert binary columns to boolean type (assuming "Yes"/"No" values):
# "Yes" becomes True, anything else becomes False.
binary_columns = ["Has Table booking", "Has Online delivery", "Is delivering now", "Switch to order menu"]
for column in binary_columns:
    # Skip columns that are already boolean. Without this guard, re-running the
    # cell would compare True/False against "Yes" and silently reset every
    # flag to False.
    if not pd.api.types.is_bool_dtype(df[column]):
        # Vectorized comparison instead of an element-wise lambda.
        df[column] = df[column] == "Yes"

# Verify the changes
print(df.dtypes)

Restaurant ID              int64
Restaurant Name         category
Country Code               int64
City                    category
Address                 category
Locality                category
Locality Verbose        category
Longitude                float64
Latitude                 float64
Cuisines                category
Average Cost for two       int64
Currency                category
Has Table booking           bool
Has Online delivery         bool
Is delivering now           bool
Switch to order menu        bool
Price range                int64
Aggregate rating         float64
Rating color            category
Rating text             category
Votes                      int64
dtype: object


### 1. Basic Statistical Measures for Numerical Columns

In [12]:
# Summary table for the numeric columns (count, mean, std, min, quartiles, max).
numerical_stats = df.describe()
print("Basic Statistical Measures:\n", numerical_stats)

# Individual measures, each computed with numeric_only=True.
mean_values = df.mean(numeric_only=True)      # mean
median_values = df.median(numeric_only=True)  # median
mode_values = df.mode(numeric_only=True)      # mode
sd_values = df.std(numeric_only=True)         # standard deviation
var_values = df.var(numeric_only=True)        # variance

# Print each measure under its heading, in the order mean, median, mode,
# standard deviation, variance.
stat_report = (
    ("\nMean values:\n", mean_values),
    ("\nMedian values:\n", median_values),
    ("\nMode values:\n", mode_values),
    ("\nStandard deviation values:\n", sd_values),
    ("\nVariance values:\n", var_values),
)
for heading, values in stat_report:
    print(heading, values)

Basic Statistical Measures:
        Restaurant ID  Country Code    Longitude     Latitude  \
count   9.551000e+03   9551.000000  9551.000000  9551.000000   
mean    9.051128e+06     18.365616    64.126574    25.854381   
std     8.791521e+06     56.750546    41.467058    11.007935   
min     5.300000e+01      1.000000  -157.948486   -41.330428   
25%     3.019625e+05      1.000000    77.081343    28.478713   
50%     6.004089e+06      1.000000    77.191964    28.570469   
75%     1.835229e+07      1.000000    77.282006    28.642758   
max     1.850065e+07    216.000000   174.832089    55.976980   

       Average Cost for two  Price range  Aggregate rating         Votes  
count           9551.000000  9551.000000       9551.000000   9551.000000  
mean            1199.210763     1.804837          2.666370    156.909748  
std            16121.183073     0.905609          1.516378    430.169145  
min                0.000000     1.000000          0.000000      0.000000  
25%              25

### 2. Exploring the Distribution of Categorical Variables

In [13]:
# Frequency tables: number of rows per distinct value of 'Country Code',
# 'City' and 'Cuisines'.
country_distribution, city_distribution, cuisines_distribution = (
    df[column].value_counts() for column in ('Country Code', 'City', 'Cuisines')
)

# Print each table under its heading.
distribution_report = (
    ("Country Code Distribution:\n", country_distribution),
    ("\nCity Distribution:\n", city_distribution),
    ("\nCuisines Distribution:\n", cuisines_distribution),
)
for heading, counts in distribution_report:
    print(heading, counts)

Country Code Distribution:
 Country Code
1      8652
216     434
215      80
30       60
214      60
189      60
148      40
208      34
14       24
162      22
94       21
184      20
166      20
191      20
37        4
Name: count, dtype: int64

City Distribution:
 City
New Delhi      5473
Gurgaon        1118
Noida          1080
Faridabad       251
Ghaziabad        25
               ... 
Dicky Beach       1
Lorn              1
Quezon City       1
Lincoln           1
Lakeview          1
Name: count, Length: 141, dtype: int64

Cuisines Distribution:
 Cuisines
North Indian                                             945
North Indian, Chinese                                    511
Chinese                                                  354
Fast Food                                                354
North Indian, Mughlai                                    334
                                                        ... 
Continental, Italian, North Indian, Lebanese, Thai         1
Contine

### 3. Top Cuisines and Cities with the Most Restaurants

In [14]:
# Ten most frequent 'Cuisines' values.
# Note: counts are per full 'Cuisines' string, so a combination such as
# "North Indian, Chinese" is tallied separately from "North Indian".
cuisine_counts = df['Cuisines'].value_counts()
top_cuisines = cuisine_counts.iloc[:10]
print("Top 10 Cuisines:\n", top_cuisines)

# Ten cities with the most restaurant rows.
city_counts = df['City'].value_counts()
top_cities = city_counts.iloc[:10]
print("Top 10 Cities with the Highest Number of Restaurants:\n", top_cities)

Top 10 Cuisines:
 Cuisines
North Indian                      945
North Indian, Chinese             511
Chinese                           354
Fast Food                         354
North Indian, Mughlai             334
Cafe                              299
Bakery                            218
North Indian, Mughlai, Chinese    197
Bakery, Desserts                  170
Street Food                       149
Name: count, dtype: int64
Top 10 Cities with the Highest Number of Restaurants:
 City
New Delhi       5473
Gurgaon         1118
Noida           1080
Faridabad        251
Ghaziabad         25
Bhubaneshwar      21
Lucknow           21
Amritsar          21
Ahmedabad         21
Guwahati          21
Name: count, dtype: int64
