# Part 1 - Original dataset

In [1]:
import pandas as pd

In [2]:
df1 = pd.read_csv(r"C:\Users\Siddhant\Downloads\ninja_api_results1.csv")
df2 = pd.read_csv(r"C:\Users\Siddhant\Downloads\ninja_api_results2.csv")
df3 = pd.read_csv(r"C:\Users\Siddhant\Downloads\ninja_api_results3.csv")
df4 = pd.read_csv(r"C:\Users\Siddhant\Downloads\ninja_api_results4.csv")
df5 = pd.read_csv(r"C:\Users\Siddhant\Downloads\ninja_api_results5.csv")

In [3]:
merged_df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

In [4]:
models_with_no_data = merged_df.groupby('Model')['Combination MPG'].apply(lambda x: all(x == 'No data found'))

count_models = models_with_no_data.sum()
print(count_models)

458


In [5]:
unique_model_count = merged_df['Model'].nunique()

# Print the count
print(unique_model_count)

1435


In [6]:
merged_df.head()

Unnamed: 0,Model,Model Year,Combination MPG
0,3 Series,2002,No data found
1,3 Series,2000,No data found
2,3 Series,2004,No data found
3,3 Series,2012,No data found
4,3 Series,2009,No data found


# Part 2 - Corrected MPG Values

In [7]:
df6 = pd.read_csv(r"C:\Users\Siddhant\Downloads\new_models_mpg1.csv")
df7 = pd.read_csv(r"C:\Users\Siddhant\Downloads\new_models_mpg2.csv")

merged_df_corrected = pd.concat([df6, df7], ignore_index=True)

In [8]:
merged_df_corrected.head()

Unnamed: 0,Model,ModelYear,Combination MPG
0,128i,2002,No data found
1,128i,2000,No data found
2,128i,2004,No data found
3,128i,2012,21
4,128i,2009,21


In [9]:
models_with_no_data = merged_df_corrected.groupby('Model')['Combination MPG'].apply(lambda x: all(x == 'No data found'))

count_models = models_with_no_data.sum()
print(count_models)

41


In [10]:
merged_df_corrected = merged_df_corrected.rename(columns={'ModelYear': 'Model Year'})

In [11]:
unique_model_count = merged_df_corrected['Model'].nunique()

# Print the count
print(unique_model_count)

202


In [12]:
merged_df_corrected.head()

Unnamed: 0,Model,Model Year,Combination MPG
0,128i,2002,No data found
1,128i,2000,No data found
2,128i,2004,No data found
3,128i,2012,21
4,128i,2009,21


# Part 3 - Corrected Model Name

In [13]:
df7 = pd.read_csv(r"C:\Users\Siddhant\Downloads\models_with_correct_names.csv")

df7.head()

Unnamed: 0,model,vin,model_api
0,1 Series,WBAUL7C58BVM80588,128i
1,2 Series,WBA1J9C53GV696599,M235i
2,24B,WP0CA218ADS114152,Not Found
3,25L,4S3BMGK63A3230154,Legacy
4,3 Series,WBAEV33452KL68783,325i


In [14]:
df7 = df7.rename(columns={'model': 'Model'})

In [15]:
unique_model_count = df7['Model'].nunique()

# Print the count
print(unique_model_count)

447


In [16]:
unique_model_count = df7['model_api'].nunique()

# Print the count
print(unique_model_count)

234


# Part 4 - Giving Original Model Name

In [17]:
merged_df_corrected = merged_df_corrected.rename(columns={'Model': 'model_api'})

In [18]:
merged_df_corrected.head()

Unnamed: 0,model_api,Model Year,Combination MPG
0,128i,2002,No data found
1,128i,2000,No data found
2,128i,2004,No data found
3,128i,2012,21
4,128i,2009,21


In [19]:
common_values = set(merged_df_corrected['model_api']).intersection(set(df7['model_api']))

# Count the number of common values
common_count = len(common_values)

# Print the result
print("Number of common 'model_api' values:", common_count)

Number of common 'model_api' values: 202


In [20]:
df8 = merged_df_corrected.merge(df7[['Model', 'model_api']], on='model_api', how='left')

In [21]:
df8.head()

Unnamed: 0,model_api,Model Year,Combination MPG,Model
0,128i,2002,No data found,1 Series
1,128i,2000,No data found,1 Series
2,128i,2004,No data found,1 Series
3,128i,2012,21,1 Series
4,128i,2009,21,1 Series


In [22]:
df8 = df8.drop(columns=['model_api'])

In [23]:
unique_model_count = df8['Model'].nunique()

# Print the count
print(unique_model_count)

334


In [24]:
models_with_no_data = df8.groupby('Model')['Combination MPG'].apply(lambda x: all(x == 'No data found'))

count_models = models_with_no_data.sum()
print(count_models)

73


# Part 5 - Adding corrected MPG values to original dataset

In [25]:
# Merge the two DataFrames on Model and Model Year
df = merged_df.merge(df8, on=['Model', 'Model Year'], how='left', suffixes=('', '_new'))

# Update values where 'Combination MPG' is 'No data found'
df['Combination MPG'] = df['Combination MPG'].where(df['Combination MPG'] != 'No data found', df['Combination MPG_new'])

# Drop the temporary column
df.drop(columns=['Combination MPG_new'], inplace=True)

In [26]:
df.head()

Unnamed: 0,Model,Model Year,Combination MPG
0,3 Series,2002,21
1,3 Series,2000,No data found
2,3 Series,2004,21
3,3 Series,2012,No data found
4,3 Series,2009,No data found


In [27]:
models_with_no_data = df.groupby('Model')['Combination MPG'].apply(lambda x: all(x == 'No data found'))

count_models = models_with_no_data.sum()
print(count_models)

71


In [28]:
df.head()

Unnamed: 0,Model,Model Year,Combination MPG
0,3 Series,2002,21
1,3 Series,2000,No data found
2,3 Series,2004,21
3,3 Series,2012,No data found
4,3 Series,2009,No data found


In [29]:
df.isnull().sum()

Model                  0
Model Year             0
Combination MPG    24346
dtype: int64

In [33]:
df['Combination MPG'] = df['Combination MPG'].fillna('No data found')

In [34]:
models_with_no_data = df.groupby('Model')['Combination MPG'].apply(lambda x: all(x == 'No data found'))

count_models = models_with_no_data.sum()
print(count_models)

201


In [35]:
count_no_data = (df['Combination MPG'] == 'No data found').sum()

print(count_no_data)

30490


In [45]:
df = df.sort_values(by=['Model', 'Model Year'], ascending=[True, True])

# Part 6 - Data Validation

In [46]:
def filter_data(model_value):
    """Returns rows matching the given model_api and my values."""
    return df[(df["Model"] == model_value) ]

In [47]:
model_input = "Terrain"  # Replace with actual model value

filtered_rows = filter_data(model_input)

In [50]:
filtered_rows.head(35)

Unnamed: 0,Model,Model Year,Combination MPG
124,Terrain,1993,No data found
122,Terrain,1995,No data found
114,Terrain,1996,No data found
107,Terrain,1997,No data found
111,Terrain,1998,No data found
108,Terrain,1999,No data found
97,Terrain,2000,No data found
110,Terrain,2001,No data found
96,Terrain,2002,No data found
116,Terrain,2003,No data found
