In [1]:
#LOCATION BASED ANALYSIS
#THIS CODE CONTAINS LOADING AND PREPARING THE DATA
import pandas as pd
import numpy as np
data = pd.read_csv(r'D:\PardivReddy_Cognifyz\content\Dataset .csv')
required_cols = ['Latitude', 'Longitude', 'City', 'Aggregate rating', 'Cuisines', 'Average Cost for two']
missing_cols = [col for col in required_cols if col not in data.columns]
if missing_cols:
    print(f"Missing columns: {missing_cols}. Please ensure Dataset.csv contains these.")
    data = data.dropna(subset=required_cols)
else:
    data = data[required_cols].dropna()
print("Data loaded. First 5 rows:")
print(data.head())

Data loaded. First 5 rows:
    Latitude   Longitude              City  Aggregate rating  \
0  14.565443  121.027535       Makati City               4.8   
1  14.553708  121.014101       Makati City               4.5   
2  14.581404  121.056831  Mandaluyong City               4.4   
3  14.585318  121.056475  Mandaluyong City               4.9   
4  14.584450  121.057508  Mandaluyong City               4.8   

                           Cuisines  Average Cost for two  
0        French, Japanese, Desserts                  1100  
1                          Japanese                  1200  
2  Seafood, Asian, Filipino, Indian                  4000  
3                   Japanese, Sushi                  1500  
4                  Japanese, Korean                  1500  


In [2]:
#VISUAL DISTRIBUTION ON MAP
import sys
!{sys.executable} -m pip install plotly
import plotly.express as px
import plotly.io as pio

pio.renderers.default = 'browser'

fig = px.scatter_mapbox(
    data,
    lat="Latitude",
    lon="Longitude",
    hover_data=["City", "Aggregate rating", "Cuisines"],
    color="Aggregate rating",
    size="Average Cost for two",
    zoom=10,
    height=600,
    mapbox_style="open-street-map",
    title="Restaurant Distribution by Location"
)
fig.show()





*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



In [4]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
def find_col_exists(df, tokens):
    tokens = [t.lower() for t in tokens]
    for c in df.columns:
        lc = c.lower()
        if all(tok in lc for tok in tokens):
            return c
    return None
features = []
c1 = find_col_exists(data, ['average','cost'])
if c1 is not None:
    features.append(c1)
c2 = find_col_exists(data, ['votes'])
if c2 is not None:
    features.append(c2)
c3 = find_col_exists(data, ['price','range'])
if c3 is not None:
    features.append(c3)
if not features:
    raise KeyError("No suitable feature columns found for modeling")
X = data[features].copy()
y_col = find_col_exists(data, ['aggregate','rating']) or find_col_exists(data, ['rating'])
if y_col is None:
    raise KeyError("Target column not found")
y = data[y_col].copy()
for col in X.columns:
    X[col] = pd.to_numeric(X[col].astype(str).str.replace(',','').str.replace('₹','').str.extract(r'([-+]?\d*\.?\d+)', expand=False), errors='coerce')
X = X.fillna(X.median())
y = pd.to_numeric(y.astype(str).str.replace(',',''), errors='coerce').fillna(y.mean())
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
print(coefficients.sort_values(by='Coefficient', ascending=False))
with open(str(r'D:\PardivReddy_Cognifyz\content\task4_analysis.txt'), 'w', encoding='utf-8') as f:
    f.write("Task 1 Completion Report\n")
    f.write(f"Mean Squared Error: {mse}\n")
    f.write(f"R-squared Score: {r2}\n")
    f.write("Preprocessed data and model saved locally. Fixed string-to-float error.\n")
print("Progress saved to 'task4_analysis.txt'.")


                Feature  Coefficient
0  Average Cost for two     0.000005
Progress saved to 'task4_analysis.txt'.


In [5]:
#GROUP BY CITY AND ANALYZE CONCENTRATION
city_stats = data.groupby('City').agg({
    'Latitude': 'count',
    'Aggregate rating': 'mean',
    'Average Cost for two': 'mean'
}).rename(columns={'Latitude': 'Restaurant Count'})
city_stats = city_stats.sort_values('Restaurant Count', ascending=False)

print("\nRestaurant Concentration by City:")
print(city_stats)
if 'Locality' in data.columns:
    locality_stats = data.groupby('Locality').agg({
        'Latitude': 'count',
        'Aggregate rating': 'mean',
        'Average Cost for two': 'mean'
    }).rename(columns={'Latitude': 'Restaurant Count'})
    print("\nRestaurant Concentration by Locality (Top 10):")
    print(locality_stats.sort_values('Restaurant Count', ascending=False).head(10))


Restaurant Concentration by City:
                  Restaurant Count  Aggregate rating  Average Cost for two
City                                                                      
New Delhi                     5473          2.438845            596.088069
Gurgaon                       1118          2.651431            714.016100
Noida                         1080          2.036204            539.490741
Faridabad                      251          1.866932            447.609562
Ghaziabad                       25          2.852000            602.000000
...                            ...               ...                   ...
Trentham East                    1          4.100000             20.000000
Weirton                          1          3.900000             25.000000
Vineland Station                 1          4.300000             70.000000
Winchester Bay                   1          3.200000             25.000000
Yorkton                          1          3.300000             

In [6]:
#CALCULATE STATICTICS AND INSIGHTS
cuisine_counts = data.groupby(['City', 'Cuisines']).size().reset_index(name='Count')
top_cuisines = cuisine_counts.loc[cuisine_counts.groupby('City')['Count'].idxmax()]
print(top_cuisines)
insights = []
if city_stats['Restaurant Count'].max() > city_stats['Restaurant Count'].median() * 2:
    insights.append("High concentration of restaurants in a few cities.")
if city_stats['Aggregate rating'].std() > 0.5:
    insights.append("Significant variation in average ratings across cities.")
if data['Average Cost for two'].corr(data['Aggregate rating']) > 0.3:
    insights.append("Positive correlation between cost and rating.")
for insight in insights:
    print("-", insight)
with open(str(r'D:\PardivReddy_Cognifyz\content\task4_analysis.txt'), 'w', encoding='utf-8') as f:
    from datetime import date
    f.write(f"Task 4: Location-based Analysis Report - {date.today().isoformat()}\n")
    f.write("Restaurant Concentration by City:\n")
    f.write(city_stats.to_string() + "\n")
    f.write("Most Common Cuisine by City:\n")
    f.write(top_cuisines.to_string() + "\n")
    f.write("Insights:\n")
    f.write("\n".join(insights) + "\n")
print("Analysis saved to 'task4_analysis.txt'.")


                 City                                          Cuisines  Count
0           Abu Dhabi                                          American      2
28               Agra                             North Indian, Mughlai      5
32          Ahmedabad  Cafe, American, Continental, Armenian, Fast Food      1
64             Albany                            Japanese, Steak, Sushi      2
78          Allahabad                             North Indian, Chinese      3
...               ...                                               ...    ...
2993          Weirton                           Burger, Greek, Sandwich      1
2996  Wellington City                                              Cafe      3
3011   Winchester Bay                            Burger, Seafood, Steak      1
3012          Yorkton                                             Asian      1
3015        ��stanbul                                              Cafe      3

[140 rows x 3 columns]
- High concentration of rest