In [10]:
import pandas as pd
from scipy import interpolate
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors

# Read CSV file and create DataFrame
df = pd.read_csv('waste.csv', usecols=["country", "year", "pollution_rate","Variable"], encoding = "ISO-8859-1", on_bad_lines="skip")
print(df)

        country                                      Variable  year  \
0     Australia          Public water supply - non-freshwater  2005   
1     Australia          Public water supply - non-freshwater  2010   
2     Australia          Public water supply - non-freshwater  2015   
3     Australia          Public water supply - non-freshwater  2016   
4     Australia          Public water supply - non-freshwater  2017   
...         ...                                           ...   ...   
6727   Colombia  Self- and other supply - Other manufacturing  2016   
6728   Colombia  Self- and other supply - Other manufacturing  2017   
6729   Colombia  Self- and other supply - Other manufacturing  2018   
6730   Colombia  Self- and other supply - Other manufacturing  2019   
6731   Colombia  Self- and other supply - Other manufacturing  2020   

      pollution_rate  
0                NaN  
1                NaN  
2            144.152  
3            148.908  
4            202.822  
...      

In [7]:
## missing_rows = df[df['pollution_rate'].isnull()]
# print(missing_rows)

# Group the data by 'country' and collect pollution rates
pollution_data = df.groupby('country')['pollution_rate'].apply(list).to_dict()


# Perform linear interpolation for each missing row
for index, row in missing_rows.iterrows():
    country = row['country']
    years = df[df['country'] == country]['year'].tolist()
    pollution_rates = pollution_data[country]
    missing_year = row['year']
    
    # Perform linear interpolation
    f = interpolate.interp1d(years, pollution_rates, kind='linear')
    interpolated_rate = f(missing_year)
    
    # Update the missing value with the interpolated rate
    df.at[index, 'pollution_rate'] = interpolated_rate

print("To see the similar countries first provide the year, for which you want to see the graph")
unique_years = df['year'].unique()
for year in unique_years:
    print(year)
selected_year = int(input("Enter the year: "))
# selected_year = 2005
filtered_df = df[df['year'] == selected_year]

print("To see the similar countries first provide the variable, for which you want to see the graph")
unique_variables = filtered_df['Variable'].unique()
for index, value in enumerate(unique_variables):
    print(f"Index: {index}, Variable: {value}")

selected_variable = unique_variables[int(input("Enter the Index: "))]
# selected_variable = unique_variables[60]
# print(selected_variable)

final_filtered_df = filtered_df[filtered_df['Variable'] == selected_variable]

pivot_table = final_filtered_df.pivot(index='country', columns='year', values='pollution_rate')
# print(len(pivot_table.index))
copy_table = pivot_table
if(len(pivot_table.index) <= 3 ): 
    print("We only have 3 countries for this criteria so result might not be accurate.")
    neigh = NearestNeighbors(n_neighbors=2)
    
else: 
    neigh = NearestNeighbors(n_neighbors=3)
neigh.fit(pivot_table.values)
distances, indices = neigh.kneighbors(pivot_table.values)
# print(pivot_table)
countries = pivot_table.index
# print(pivot_table.loc["Belgium",2005])
for index, value in enumerate(countries):
    print(f"Country Number: {index}, Country: {value}")
selected_country = int(input("Enter the Country Number for which you want the Similar Countries: "))
print(f"Countries with similar pollution rates to {countries[selected_country]} with pollution rate {pivot_table.loc[countries[selected_country],selected_year]}  in {year} for {selected_variable}:")
for country_index, country in enumerate(pivot_table.index):
    
    if(country_index == selected_country):
        justlist = pivot_table.index[indices[country_index]].to_list()
        justlist.remove(country)
        
        # similar_countries = ', '.join(justlist)
        for each in justlist:
            print(each, "with pollution rate:", pivot_table.loc[each,selected_year])
        # print(f"{country}: {similar_countries}")
# print()




To see the similar countries first provide the year, for which you want to see the graph
2005
2010
2015
2016
2017
2018
2019
2020
2021
1980
1985
1990
1995
2000
1970


Enter the year:  2005


To see the similar countries first provide the variable, for which you want to see the graph
Index: 0, Variable: Public water supply - non-freshwater
Index: 1, Variable: Electricity production (cooling). Non-freshwater
Index: 2, Variable: Total abstractions of reused water
Index: 3, Variable: Abstractions for manufacturing. Reused water
Index: 4, Variable: Total losses during transport
Index: 5, Variable: Total gross abstraction non-freshwater
Index: 6, Variable: Abstractions for services - non-freshwater
Index: 7, Variable: Abstractions for agriculture, forestry, fishing. Non-freshwater
Index: 8, Variable: Abstractions for irrigation. Non-freshwater
Index: 9, Variable: Abstractions for manufacturing. Non-freshwater
Index: 10, Variable: Abstractions for cooling (manufacturing). Non-freshwater
Index: 11, Variable: Total water made available for use
Index: 12, Variable: Abstractions for hydro-electricity
Index: 13, Variable: Public water supply
Index: 14, Variable: Public water supply fo

Enter the Index:  60


year         2005
country          
Belgium      79.0
Lithuania     0.0
Netherlands   0.0
Slovenia      0.0
Country Number: 0, Country: Belgium
Country Number: 1, Country: Lithuania
Country Number: 2, Country: Netherlands
Country Number: 3, Country: Slovenia


Enter the Country Number for which you want the Similar Countries:  1


Countries with similar pollution rates to Lithuania with pollution rate 0.0  in 1970 for Evaporation losses:
Slovenia with pollution rate: 0.0
Netherlands with pollution rate: 0.0
