# In [None]:
import pandas as pd
import numpy as nm
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from random import randint
from sys import version
from sklearn import svm
import warnings

warnings.filterwarnings("ignore")

# Load the AQI (Air Quality Index, 1-380) readings, keep only the last row
# seen for each country, shorten two long country names, and leave the rows
# sorted by country with a fresh 0..n-1 index.
csvaqi = pd.read_csv('aqi.csv')
table = (
    csvaqi
    .drop(columns=['Date', 'Status'])
    .drop_duplicates(subset=['Country'], keep='last')
    .replace(
        ['United States of America',
         'United Kingdom of Great Britain and Northern Ireland'],
        ['United States', 'United Kingdom'],
    )
    .sort_values(by=['Country'])
    .reset_index(drop=True)
)

# Load the world population table (its population column is labelled 2020),
# discard the columns that are not used, rename the country column so it
# matches the AQI table, keep the last row per country, align three country
# spellings, and leave the rows sorted by country with a fresh 0..n-1 index.
csvpop = pd.read_csv('pop.csv')
pop = (
    csvpop
    .drop(columns=[
        'Land Area (Km²)',
        'Migrants (net)',
        'Net Change',
        'Density (P/Km²)',
        'Fert. Rate',
        'Med. Age',
        'Urban Pop %',
        'World Share',
    ])
    .rename(columns={'Country (or dependency)': 'Country'})
    .drop_duplicates(subset=['Country'], keep='last')
    .replace(
        ["Côte d'Ivoire", 'Czech Republic (Czechia)', 'Cabo Verde'],
        ['Ivory Coast', 'Czech Republic', 'Cape Verde'],
    )
    .sort_values(by=['Country'])
    .reset_index(drop=True)
)
# Keep only the countries present in both frames.
#   b: row positions in `table` whose country is missing from `pop`
#   a: row positions in `pop` whose country is missing from `table`
# Both frames carry a 0..n-1 index at this point, so a row's position is also
# its index label and can be handed straight to drop(index=...).
# The country sets are built once so each membership test is O(1) instead of
# rebuilding and scanning a list on every iteration.
_pop_countries = set(pop['Country'])
_table_countries = set(table['Country'])

b = [i for i, name in enumerate(table['Country']) if name not in _pop_countries]
a = [i for i, name in enumerate(pop['Country']) if name not in _table_countries]

table.drop(index=b, inplace=True)
pop.drop(index=a, inplace=True)


# Everything is filtered down to the shared countries (~136 entries).
# The AQI readings are for 2022 while the population column is for 2020, so
# each population is projected forward by two years of compound growth.

_YEARS_TO_PROJECT = 2

c = list(pop['Population (2020)'])
d = list(pop['Yearly Change'])

# 'Yearly Change' is text whose first whitespace-separated token is the rate.
# NOTE(review): the rate is taken to be a percentage (e.g. "0.39 %"), so it is
# divided by 100 before compounding; applying the raw number as a fraction
# would grow a population by tens of percent per year. Confirm against pop.csv.
c = [
    int(int(population) * (1 + float(rate_text.split()[0]) / 100) ** _YEARS_TO_PROJECT)
    for population, rate_text in zip(c, d)
]

# Gather everything in `pop`: the projected population and the AQI column
# (copied over row by row, relying on both frames being sorted by country),
# then drop the superseded 2020 column and renumber the rows from 1.
pop['Population'] = c
pop['AQI Value'] = table['AQI Value'].tolist()
pop = pop.drop(columns=['Population (2020)'])
pop.index = list(range(1, len(pop) + 1))

# 'Yearly Change' is text; keep only its leading numeric token as a float.
change = [float(text.split()[0]) for text in pop['Yearly Change']]
pop['Yearly Change'] = change



# Target is the AQI value; the features are every other column except the
# country label. One tenth of the rows is held out for testing.
y = pop['AQI Value']
X = pop.drop(columns=['Country', 'AQI Value'])
inp, tinp, outp, toutp = train_test_split(X, y, test_size=0.1)


#uses a linear regression algorithm

#calculates the r_squared for the linear regression

def r_squared(y,predicted):
    """Return the explained share of the variance of *y* as a 2-decimal string.

    The value is sum((predicted - mean(y))**2) / sum((y - mean(y))**2).

    Args:
        y: observed values.
        predicted: model predictions for the same observations.

    Returns:
        The ratio formatted with "%.2f".
    """
    # Work in floats: truncating predictions to int before squaring distorts
    # the explained sum of squares.
    actual = nm.asarray(y, dtype=float)
    fitted = nm.asarray(predicted, dtype=float)
    avg = actual.mean()
    bottom = ((actual - avg) ** 2).sum()
    top = ((fitted - avg) ** 2).sum()
    return "%.2f" % (top/bottom)

# Least-squares line through (Population, AQI Value):
#     AQI = slope * population + intercept
# NOTE(review): the line is fitted on every row of `pop`, including the rows
# held out in tinp/toutp, unlike the classifiers which only see inp/outp.
_populations = nm.asarray(pop['Population'], dtype=float)
_aqi_values = nm.asarray(pop['AQI Value'], dtype=float)

avgx = _populations.mean()
avgy = _aqi_values.mean()

slope = ((_populations - avgx) * (_aqi_values - avgy)).sum() / ((_populations - avgx) ** 2).sum()
intercept = avgy - avgx*slope

# Predictions for every country, scored against the observed AQI values
# (the observed values are the AQI column, not the populations).
linrpredictions = list(slope * _populations + intercept)

rsquared = r_squared(list(_aqi_values), linrpredictions)

# Predictions for the held-out rows are made from their populations
# (the model input), not from their AQI targets.
linPredictions = [slope * float(population) + intercept for population in tinp['Population']]



# Each classifier is fitted on the training split, asked to predict the
# held-out rows, and scored by exact-match accuracy against toutp.

# Decision tree
tree = DTC().fit(inp, outp)
treePredictions = tree.predict(tinp)
treeaccuracy = accuracy_score(toutp, treePredictions)

# Logistic regression
log = LR(max_iter=10000).fit(inp, outp)
logPredictions = log.predict(tinp)
logaccuracy = accuracy_score(toutp, logPredictions)

# Support vector machine
# NOTE: this rebinds the name `svm` from the imported sklearn module to the
# fitted classifier, so this statement cannot be executed a second time.
svm = svm.SVC().fit(inp, outp)
svmPredictions = svm.predict(tinp)
svmaccuracy = accuracy_score(toutp, svmPredictions)

# Totals of the (integer-truncated) predictions of each model and of the real
# AQI values over the held-out rows.
lintotal = sum(int(value) for value in linPredictions)
treetotal = sum(int(value) for value in treePredictions)
logtotal = sum(int(value) for value in logPredictions)
svmtotal = sum(int(value) for value in svmPredictions)
realtotal = sum(int(value) for value in toutp)

l = len(toutp)

# Absolute value of the average (predicted - real) gap for each model, in the
# order linear, tree, logistic, svm.
# NOTE(review): the gap is averaged before taking abs(), so over- and
# under-predictions cancel each other out.
diff = [
    abs((total - realtotal) / l)
    for total in (lintotal, treetotal, logtotal, svmtotal)
]

#starts the interactive user menu

def menu():
    """Print the main menu options followed by one blank line."""
    options = (
        'a) Find AQI for a country',
        'b) Find Population of a country',
        'c) Enter AQI to find (a) matching countr(y)(ies)',
        'd) Enter a Population and get an AQI range using different ML algorithms',
        'q) Quit ',
    )
    for option in options:
        print(option)
    print()
    
    
def rand(num):
    """Return *num* distinct country names picked at random from ``pop``.

    Args:
        num: how many names to return.

    Returns:
        A list of distinct names from ``pop['Country']`` in random order.
        When fewer than *num* distinct names exist, all of them are returned;
        a non-positive *num* yields an empty list.
    """
    from random import sample

    # dict.fromkeys removes repeated names while keeping a list-like order,
    # so sampling without replacement always yields distinct entries.
    names = list(dict.fromkeys(pop['Country']))
    count = max(0, min(num, len(names)))
    return sample(names, count)


def difference(test,real):
    """Count the positions at which two strings differ, ignoring case.

    The shorter string is right-padded with spaces to the length of the
    longer one before the position-by-position comparison.
    """
    width = max(len(test), len(real))
    padded_test = test.ljust(width)
    padded_real = real.ljust(width)
    return sum(
        1
        for left, right in zip(padded_test, padded_real)
        if left.lower() != right.lower()
    )
    
def match(country):
    """Suggest up to five country names from ``pop`` that resemble *country*.

    A name is suggested when, compared case-insensitively and with outer
    whitespace removed, it equals the query, shares any run of three
    consecutive characters with it, or differs from it in at most three
    positions according to ``difference``. The scan stops at an exact match.

    Args:
        country: the text typed by the user.

    Returns:
        A list of capitalised names (see ``cap``), or ``['None']`` when
        nothing resembles the query.
    """
    max_suggestions = 5
    # Normalise the query once and never modify it afterwards, so every
    # candidate is compared against what the user actually typed.
    query = country.lower().strip()
    query_grams = {query[k:k + 3] for k in range(len(query) - 2)}

    suggestions = []
    for name in pop['Country']:
        if len(suggestions) >= max_suggestions:
            break
        candidate = str(name).lower().strip()
        exact = candidate == query
        shares_gram = any(
            candidate[j:j + 3] in query_grams for j in range(len(candidate) - 2)
        )
        if exact or shares_gram or difference(candidate, query) <= 3:
            label = cap(candidate)
            if label not in suggestions:
                suggestions.append(label)
        if exact:
            break

    return suggestions if suggestions else ['None']
    

def getAQI(country):
    """Return the 'AQI Value' stored in ``pop`` for *country*.

    The name is compared case-insensitively, ignoring surrounding whitespace
    in the argument.

    Returns:
        The AQI value of the first matching row, or None when no row matches.
    """
    wanted = str(country).strip().lower()
    # Walk the two columns in lockstep instead of rebuilding them as lists on
    # every iteration.
    for name, aqi in zip(pop['Country'], pop['AQI Value']):
        if str(name).lower() == wanted:
            return aqi
    return None
def getPop(country):
    """Return the 'Population' stored in ``pop`` for *country*.

    The name is compared case-insensitively, ignoring surrounding whitespace
    in the argument.

    Returns:
        The population of the first matching row, or None when no row matches.
    """
    wanted = str(country).strip().lower()
    # Walk the two columns in lockstep instead of rebuilding them as lists on
    # every iteration.
    for name, population in zip(pop['Country'], pop['Population']):
        if str(name).lower() == wanted:
            return population
    return None
    
    
def cap(string):
    """Upper-case the first character of every whitespace-separated word.

    The words are re-joined with single spaces, so leading, trailing and
    repeated whitespace is collapsed; the rest of each word is left as is.
    """
    words = []
    for word in string.split():
        words.append(word[0].upper() + word[1:])
    return ' '.join(words)

def aqirange(aqi):
    """Print the countries in ``pop`` whose AQI equals or neighbours *aqi*.

    Two groups are reported, each limited to its first six entries:
    countries whose AQI is exactly one below or one above *aqi*, and countries
    whose AQI equals *aqi*. When both groups are empty a single "no country"
    line is printed instead.

    Args:
        aqi: the integer AQI to look up.
    """
    limit = 6
    exact = []
    below = []
    above = []
    for name, value in zip(pop['Country'], pop['AQI Value']):
        value = int(value)
        if value == aqi:
            exact.append(str(name))
        elif value == aqi - 1:
            below.append(str(name))
        elif value == aqi + 1:
            above.append(str(name))

    # Combine the neighbours only after both lists are filled, so that
    # countries one above the requested AQI are reported as well.
    close = below + above

    if len(close) >= 2:
        print('Countries that have AQI close(one on either side) to ' + str(aqi) + ': [' + ', '.join(close[:limit]) + ']')
    elif len(close) == 1:
        print('The country that has an AQI close(one on either side) to ' + str(aqi) + ': [' + ', '.join(close[:limit]) + ']')

    if len(exact) >= 2:
        print('Countries that have an AQI of ' + str(aqi) + ': [' + ', '.join(exact[:limit]) + ']')
    elif len(exact) == 1:
        print('The country that has an AQI of ' + str(aqi) + ': [' + ', '.join(exact[:limit]) + ']')

    if not exact and not close:
        print('No Country has an AQI of ' + str(aqi))
        
def ml(pop):
    """Print the AQI predicted for a population by each of the four models.

    Args:
        pop: the population to predict for (this parameter shadows the
            module-level ``pop`` table inside the function).

    For each model the matching entry of ``diff`` is printed next to the
    prediction. NOTE(review): the output labels that figure a "standard
    deviation"; ``diff`` holds the absolute average gap between predicted and
    real values on the held-out rows.
    """
    # The classifiers were fitted on the columns of the population table minus
    # 'AQI Value' and 'Country'; 'Population' was appended to that table after
    # 'Yearly Change', so the feature order is [Yearly Change, Population].
    # The yearly change is unknown for a user-supplied population, so 0.0 is
    # used. Naming the columns lets scikit-learn check them against the
    # names seen during fitting.
    features = pd.DataFrame([[0.0, pop]], columns=['Yearly Change', 'Population'])
    mlpredictions = [
        pop*slope+intercept,
        tree.predict(features),
        log.predict(features),
        svm.predict(features),
    ]
    print('Given a population of ' + str(pop)+', the following are the predictions of an AQI for 4 different algorithm models')
    print()
    print('Linear Regression: ' + str(mlpredictions[0]))
    print('This Linear Regression has a standard deviation of ' + str(diff[0]))
    print()
    print('DecisionTreeClassifier: ' + str(mlpredictions[1]))
    print('This DecisionTreeClassifier has a standard deviation of ' + str(diff[1]))
    print()
    print('Logistic Regression: ' + str(mlpredictions[2]))
    print('This Logistic Regression has a standard deviation of ' + str(diff[2]))
    print()
    print('Support Vector Machines: ' + str(mlpredictions[3]))
    print('This SVM has a standard deviation of ' + str(diff[3]))
    print()
def _is_quit(text):
    """Return True when *text* is one of the quit commands."""
    return text in ('q', 'quit')


def _country_session(describe):
    """Repeatedly ask for a country name and print ``describe(name)`` for it.

    *describe* receives the lower-cased name of a country that exists in the
    population table and returns the line to print. Unknown names produce
    suggestions from ``match``. The session ends on 'q' or 'quit'.
    """
    while True:
        print('Enter the name of a Country (q to quit)')
        print('Examples: '+', '.join(sorted(str(i) for i in rand(5))))
        print()
        country = input().lower()
        if _is_quit(country):
            return

        if country in [str(i).lower() for i in pop['Country']]:
            print(describe(country))
            print()
            continue

        matches = match(country)
        print('No entry matched ' + cap(country))
        print()
        if matches[0] != 'None':
            print('Did you mean any of the following? [' + ', '.join(str(i) for i in matches)+']')
            print('Try Again')
            print()


def _aqi_session():
    """Repeatedly ask for an AQI and list the matching countries."""
    while True:
        print('Enter an AQI, an integer in range [1,380] (q to quit)')
        text = input()
        print()
        if _is_quit(text):
            return
        # Only the conversion is guarded, so errors raised while looking up
        # the countries are not mistaken for bad input.
        try:
            aqi = int(text)
        except ValueError:
            print('That is not a valid AQI')
            print('An AQI ranges is an integer')
            print()
            continue
        print()
        if aqi > 380 or aqi <= 0:
            print('That is not a valid AQI')
            print('An AQI ranges from 1 to 380')
            print()
        else:
            aqirange(aqi)
            print()


def _population_session():
    """Repeatedly ask for a population and print the model predictions."""
    while True:
        print('Enter an integer representing population (q to quit)')
        text = input()
        print()
        if _is_quit(text):
            return
        # The typed value is kept in a local name so the module-level `pop`
        # table is never overwritten by user input.
        try:
            population = int(text)
        except ValueError:
            print('That is not a valid population')
            print('Enter an integer')
            continue
        print()
        ml(population)


user = 'a'
while user not in ('q', '5'):
    menu()
    # Only the first character of the answer selects the option; an empty
    # line selects nothing and the menu is shown again.
    user = input().strip()[:1].lower()
    print()
    print()
    if user in ('1', 'a'):
        _country_session(lambda name: cap(name) + ' has an AQI of ' + str(getAQI(name)))
    elif user in ('2', 'b'):
        _country_session(lambda name: cap(name) + ' has a population of ' + str(getPop(name)))
    elif user in ('3', 'c'):
        _aqi_session()
    elif user in ('d', '4'):
        _population_session()

    print()


print('Thanks for using!')