In [5]:


# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'swiggy-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4550138%2F7776386%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240401%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240401T080010Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5c54f7158b89d286d001517a03e0e7e84fd7e5020a3cd95624a4bfdf7b15fa5468ce3e31d4016c4dec4c8a2a633b82e234d5f676eab8017c0595a663885401975bc893dc8a932c8df52e2d35f8bc94716f80a6a2974bb03eda6f0f077e5f9591bf0c9b53bccb277ec6eafc854c08191ef3b4bfec793464fa554dbac7ab22a622486dc2a1b3f8bfe322d140e851453aa887a1d6c33c7882c26c4e0a76e7621a48f341a8f535bb0295a9d1d8bcdf23f02c9f0f4dc21d69ce65c494dfc65f8eafa77896af1e32525b7762d33c6e2b16a293c057271dd1b2f9f32c9f7c83f3bc68b25cb175fe79ea7f9a74067c5501dae214c7f87ec3bf9edadb8f2d48065d2d31d3'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')

Downloading swiggy-dataset, 74001 bytes compressed
Downloaded and uncompressed: swiggy-dataset
Data source import complete.


In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/swiggy-dataset/swiggy_scrap_uncleaned.csv
/kaggle/input/swiggy-dataset/swiggy_cleaned.csv


In [7]:
df = pd.read_csv('/kaggle/input/swiggy-dataset/swiggy_cleaned.csv')
df

Unnamed: 0,hotel_name,rating,time_minutes,food_type,location,offer_above,offer_percentage
0,McDonald's,4.5,27,"Burgers, Beverages, Cafe, Desserts",Kandivali East,75,30
1,KFC,4.2,30,"Burgers, Biryani, American, Snacks, Fast Food",Kandivali East,80,40
2,Domino's Pizza,4.3,25,"Pizzas, Italian, Pastas, Desserts",Thakur Village,299,not_available
3,Charcoal Eats - Biryani & Beyond,4.3,24,"Biryani, Kebabs, Hyderabadi, North Indian",Malad Kan East,100,50
4,Sandwizzaa,4.6,22,"Snacks, Fast Food, Beverages, Jain",Kandivali East,120,60
...,...,...,...,...,...,...,...
1744,Allspice,3.8,48,"North Indian, Chinese, Biryani, Tandoor",Kandivali west,700,20
1745,Poddar Cuisine,4.6,52,"Chinese, Biryani, Beverages",Malad Kan West,not_available,not_available
1746,JUG's Kitchen,4,48,"Indian, Lebanese, Mughlai, Chinese",Goregaon East,999,not_available
1747,Choco Magic Patisserie & Confectioners Studio,4.1,43,"Desserts, Beverages",Malad West,not_available,not_available


In [8]:
# to check the shape of dataset
df.shape

(1749, 7)

In [9]:
# to check null values in columns
df.isnull().sum()
# there are 343 null values

hotel_name            0
rating                0
time_minutes        343
food_type             0
location              0
offer_above           0
offer_percentage      0
dtype: int64

In [10]:
# to know the information about dataset
df.info()
# every column is string

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1749 entries, 0 to 1748
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   hotel_name        1749 non-null   object
 1   rating            1749 non-null   object
 2   time_minutes      1406 non-null   object
 3   food_type         1749 non-null   object
 4   location          1749 non-null   object
 5   offer_above       1749 non-null   object
 6   offer_percentage  1749 non-null   object
dtypes: object(7)
memory usage: 95.8+ KB


In [11]:
# check unique values of time minutes
df['time_minutes'].unique()

array(['27', '30', '25', '24', '22', '32', '14', '13', '17', '34', '18',
       '15', '26', '16', '23', '29', '35', '33', '11-21', '36', '21',
       '20', '28', nan, '31', '16-26', '17-27', '19', '10-20', '18-28',
       '13-23', '22-32', '39', '7-17', '24-34', '40', '23-32', '37', '49',
       '38', '47', '43', '54', '45', '50', '42', '48', '46', '56', '44',
       '55', '51', '52', '70', '53', '64', '58', '65', '57', '80', '59',
       '41', '73', '67', '61', '66', '78', '60', '63', '12', '81', '62',
       '69', '68', '79', '75', '74', '25-35', '72', '71'], dtype=object)

In [12]:
# replacing double values with their mean and making it float column
df['time_minutes'] = df['time_minutes'].replace('11-21',16).replace('16-26',21).replace('17-27',22).replace('10-20',15).replace('18-28',23).replace('13-23',18).replace('22-32',27).replace('7-17',12).replace('24-34',29).replace('23-32',27).replace('25-35',30).astype(float)

In [13]:
# filling na values with mean and makin int column
df['time_minutes'] = df['time_minutes'].fillna(int(df['time_minutes'].mean())).astype(int)

In [14]:
df[df['rating'].str.contains('min')]
# i have done the mistake here time_minutes column values are located in rating column and replaced the na values with there mean

Unnamed: 0,hotel_name,rating,time_minutes,food_type,location,offer_above,offer_percentage
57,Dominic Pizza,33 mins,42,"Fast Food, Pizzas, Chinese, Italian, Pastas, D...",Pizza Story,100,50
114,Tiffin Box,27 mins,42,"Biryani, Beverages, North Eastern",Malad Kan East,75,30
134,Burger Hub,34 mins,42,"Burgers, Snacks, Chinese",Borivali,not_available,not_available
152,Ghar Ki Rasoi,32 mins,42,North Indian,Malad Kan East,not_available,not_available
172,Iceberg Organic Icecreams,36 mins,42,Ice Cream,Borivali,299,not_available
...,...,...,...,...,...,...,...
1698,Double Door Lounge,42 mins,42,"Chinese, Italian",Borivali,not_available,not_available
1701,I M Chinese Food Corner,37 mins,42,"Chinese, Snacks",Dahisar,not_available,not_available
1702,Mr.D Misal Pav,44 mins,42,Indian,Malad Kan West,not_available,not_available
1736,2 Pai cakes,52 mins,42,"Bakery, Beverages",Jog Gor West,not_available,not_available


In [15]:
temp_df = df[df['rating'].str.contains('min')]
temp_df

Unnamed: 0,hotel_name,rating,time_minutes,food_type,location,offer_above,offer_percentage
57,Dominic Pizza,33 mins,42,"Fast Food, Pizzas, Chinese, Italian, Pastas, D...",Pizza Story,100,50
114,Tiffin Box,27 mins,42,"Biryani, Beverages, North Eastern",Malad Kan East,75,30
134,Burger Hub,34 mins,42,"Burgers, Snacks, Chinese",Borivali,not_available,not_available
152,Ghar Ki Rasoi,32 mins,42,North Indian,Malad Kan East,not_available,not_available
172,Iceberg Organic Icecreams,36 mins,42,Ice Cream,Borivali,299,not_available
...,...,...,...,...,...,...,...
1698,Double Door Lounge,42 mins,42,"Chinese, Italian",Borivali,not_available,not_available
1701,I M Chinese Food Corner,37 mins,42,"Chinese, Snacks",Dahisar,not_available,not_available
1702,Mr.D Misal Pav,44 mins,42,Indian,Malad Kan West,not_available,not_available
1736,2 Pai cakes,52 mins,42,"Bakery, Beverages",Jog Gor West,not_available,not_available


In [16]:
x = temp_df['rating'].str.split(' ').str[0].values

In [17]:
# chenging the values of time minutes column
df.loc[temp_df.index,temp_df.columns[2]] = x

In [18]:
temp_df['rating'] = 'na'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['rating'] = 'na'


In [19]:
y = temp_df['rating'].values

In [20]:
df.loc[temp_df.index,temp_df.columns[1]] = y

In [21]:
df['offer_above'].unique()

array(['75', '80', '299', '100', '120', '150', 'not_available', '50',
       '149', '799', '99', '599', '449', '999', '749', '45', '699',
       'FREE ITEM', '40', '249', '169', '500', '700', '1500', '900',
       '499', '179', '600', '20% OFF', '110', '159', '1900', '1000',
       '899', '399', '129', '800', '125', '1999', '199', '1199', '1200',
       '349', '300', '1299', '200', '1300', '1099', '175', '350', '250',
       '65'], dtype=object)

In [22]:
df[df['offer_above'] ==  '20% OFF']

Unnamed: 0,hotel_name,rating,time_minutes,food_type,location,offer_above,offer_percentage
149,Natural Ice Cream,4.7,21,"Ice Cream, Desserts",Avenue Hotel,20% OFF,20


In [23]:
df.drop(index=[149],inplace=True)

In [24]:
df[df['offer_above'] ==  'FREE ITEM']

Unnamed: 0,hotel_name,rating,time_minutes,food_type,location,offer_above,offer_percentage
28,Nishi Home Delites,4.6,27,"Indian, Punjabi",Kandivali East,FREE ITEM,not_available
39,Shawarmaji,4.4,16,Lebanese,Thakur village Kandivali east,FREE ITEM,not_available
60,Saffron Restaurant,4.1,31,"North Indian, Chinese, Beverages, Tandoor, Kebabs",Kandivali East,FREE ITEM,not_available
196,Ladu Samrat,4.3,43,"Maharashtrian, Snacks, Street Food, Indian, De...",Kandivali East,FREE ITEM,not_available


In [25]:
df.drop(index=[28,39,60,196],inplace=True)

In [26]:
df['offer_above'].unique()

array(['75', '80', '299', '100', '120', '150', 'not_available', '50',
       '149', '799', '99', '599', '449', '999', '749', '45', '699', '40',
       '249', '169', '500', '700', '1500', '900', '499', '179', '600',
       '110', '159', '1900', '1000', '899', '399', '129', '800', '125',
       '1999', '199', '1199', '1200', '349', '300', '1299', '200', '1300',
       '1099', '175', '350', '250', '65'], dtype=object)

In [27]:
df['offer_percentage'].unique()

array(['30', '40', 'not_available', '50', '60', '20', '15', '10', '25',
       '35'], dtype=object)