### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import nltk
import os

# Plot
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import warnings

# Loading Data

### Cities Responses

In [None]:
def resumetable(df):
    """Print the shape of ``df`` and return a one-row-per-column summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Any index is accepted; sample rows are taken by
        position, not by label.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``dtypes``, ``Missing`` (null count), ``Uniques``
        (distinct non-null values) and ``First Value`` / ``Second Value`` /
        ``Third Value`` (values of the first three rows; NaN when ``df`` has
        fewer rows than that).
    """
    print(f"Dataset Shape: {df.shape}")

    # rename_axis guarantees the column-name field is called "Name" even when
    # the columns axis of ``df`` carries its own name.
    summary = df.dtypes.rename_axis('Name').reset_index(name='dtypes')
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Positional access (iloc) instead of label access (loc): the labels 0, 1, 2
    # need not exist, and duplicated labels would return several rows.
    n_rows = len(df)
    for position, label in enumerate(('First Value', 'Second Value', 'Third Value')):
        summary[label] = df.iloc[position].values if position < n_rows else np.nan
    return summary

In [None]:
# Cities questionnaire responses: one CSV per reporting year, read in order
# and unpacked into the three per-year frames.
df_ct_full_2018, df_ct_full_2019, df_ct_full_2020 = map(
    pd.read_csv,
    (
        "../input/cdp-unlocking-climate-solutions/Cities/Cities Responses/2018_Full_Cities_Dataset.csv",
        "../input/cdp-unlocking-climate-solutions/Cities/Cities Responses/2019_Full_Cities_Dataset.csv",
        "../input/cdp-unlocking-climate-solutions/Cities/Cities Responses/2020_Full_Cities_Dataset.csv",
    ),
)

### Corporations Disclosing

In [None]:
# Corporations disclosing to CDP (Climate Change): one CSV per survey year,
# read in order and unpacked into the three per-year frames.
df_cl_2018, df_cl_2019, df_cl_2020 = map(
    pd.read_csv,
    (
        "../input/cdp-unlocking-climate-solutions/Corporations/Corporations Disclosing/Climate Change/2018_Corporates_Disclosing_to_CDP_Climate_Change.csv",
        "../input/cdp-unlocking-climate-solutions/Corporations/Corporations Disclosing/Climate Change/2019_Corporates_Disclosing_to_CDP_Climate_Change.csv",
        "../input/cdp-unlocking-climate-solutions/Corporations/Corporations Disclosing/Climate Change/2020_Corporates_Disclosing_to_CDP_Climate_Change.csv",
    ),
)

### Joining Climate Change Data.

In [None]:
# Cities Responses: stack the 2018, 2019 and 2020 frames row-wise.
df = [
    df_ct_full_2018,
    df_ct_full_2019,
    df_ct_full_2020,
]
df_ct = pd.concat(objs=df)

In [None]:
# Resetting the index.
# drop=True discards the per-file row labels left over from the concat instead
# of inserting them as an extra "index" column, and keeps this cell re-runnable
# (a plain reset_index() adds "index", then "level_0", then raises).
df_ct = df_ct.reset_index(drop=True)

In [None]:
# Per-column overview of the combined cities frame (dtype, missing count,
# unique count and the first three rows' values).
resumetable(df_ct)

In [None]:
# Corporations Disclosing: stack the 2018, 2019 and 2020 frames row-wise.
df = [
    df_cl_2018,
    df_cl_2019,
    df_cl_2020,
]
df_cl = pd.concat(objs=df)

In [None]:
# Resetting the index.
# drop=True discards the per-file row labels left over from the concat instead
# of inserting them as an extra "index" column, and keeps this cell re-runnable
# (a plain reset_index() adds "index", then "level_0", then raises).
df_cl = df_cl.reset_index(drop=True)

In [None]:
# Per-column overview of the combined corporations frame (dtype, missing count,
# unique count and the first three rows' values).
resumetable(df_cl)

# Exploratory analysis

* Cities Responses Data 

Year Reported to CDP

In [None]:
# Number of rows per reporting year, largest group first.
group = df_ct.groupby(by='Year Reported to CDP').size()
group.sort_values(ascending=False)

In [None]:
# Bar chart of df_ct rows per reporting year; each bar is labelled with its
# share of all rows.
plt.figure(figsize=(15, 8))

freq = len(df_ct)  # denominator for the percentage labels

sns.set_palette("pastel")

# Pass the column as `x=`: seaborn 0.12+ reserves the first positional argument
# for `data`, so a positional Series is no longer interpreted as the x variable.
g = sns.countplot(x=df_ct['Year Reported to CDP'])
g.set_xlabel('Year', fontsize = 15)
g.set_ylabel("Count", fontsize = 15)

# Write the percentage just above the centre of every bar.
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x() + p.get_width() / 2., height + 3,
          '{:1.2f}%'.format(height / freq * 100),
          ha = "center", fontsize = 18)

Region

In [None]:
# Number of rows per CDP region, largest group first.
group = df_ct.groupby(by='CDP Region').size()
group.sort_values(ascending=False)

In [None]:
# Bar chart of df_ct rows per CDP region, ordered from most to least frequent;
# each bar is labelled with its share of all rows.
plt.figure(figsize=(18, 8))

freq = len(df_ct)  # denominator for the percentage labels

sns.set_palette("pastel")

# Pass the column as `x=`: seaborn 0.12+ reserves the first positional argument
# for `data`, so a positional Series is no longer interpreted as the x variable.
g = sns.countplot(x=df_ct['CDP Region'], order = df_ct['CDP Region'].value_counts().index)
g.set_xlabel('Region', fontsize = 15)
g.set_ylabel("Count", fontsize = 15)

# Write the percentage just above the centre of every bar.
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x() + p.get_width() / 2., height + 3,
          '{:1.2f}%'.format(height / freq * 100),
          ha = "center", fontsize = 18)

CDP Region by Year Reported to CDP

In [None]:
# Region x year contingency table, shaded with a light-to-red gradient.
features = ["CDP Region", 'Year Reported to CDP']
cm = sns.light_palette("red", as_cmap=True)
region_by_year = pd.crosstab(index=df_ct[features[0]], columns=df_ct[features[1]])
region_by_year.style.background_gradient(cmap=cm)

Country

In [None]:
# Number of rows per country, largest group first.
group = df_ct.groupby(by='Country').size()
group.sort_values(ascending=False)

In [None]:
# Row counts per country, most frequent first.
city_count = df_ct['Country'].value_counts()
# head(10) takes the ten most frequent countries. The previous `[:10,]` built a
# 1-tuple key and only worked through a pandas special case for such tuples.
city_count_10 = city_count.head(10)
city_count_10

In [None]:
# Bar chart of the ten countries with the most rows in df_ct; each bar is
# labelled with its share of all rows.
plt.figure(figsize=(18, 8))

freq = len(df_ct)  # denominator for the percentage labels

sns.set_palette("pastel")

# x/y must be keywords: seaborn 0.12+ accepts only `data` positionally, so the
# former positional call raises a TypeError there.
g = sns.barplot(x=city_count_10.index, y=city_count_10.values)
g.set_title('Top 10 Country', fontsize = 15)
# The bars are countries, so label the axis accordingly (it previously said 'Region').
g.set_xlabel('Country', fontsize = 15)
g.set_ylabel("Count", fontsize = 15)
plt.xticks(rotation = 90)

# Write the percentage just above the centre of every bar.
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x() + p.get_width() / 2., height + 3,
          '{:1.2f}%'.format(height / freq * 100),
          ha = "center", fontsize = 18)

Organization

In [None]:
# Number of rows per organization, largest group first.
group_ct = df_ct.groupby(by='Organization').size()
group_ct.sort_values(ascending=False)

In [None]:
# Row counts per organization, most frequent first.
ct_count = df_ct['Organization'].value_counts()
# head(10) takes the ten most frequent organizations. The previous `[:10,]`
# built a 1-tuple key and only worked through a pandas special case for such tuples.
ct_count_10 = ct_count.head(10)
ct_count_10

In [None]:
# Bar chart of the ten organizations with the most rows in df_ct; each bar is
# labelled with its share of all rows.
plt.figure(figsize=(18, 8))

freq = len(df_ct)  # denominator for the percentage labels

sns.set_palette("pastel")

# x/y must be keywords: seaborn 0.12+ accepts only `data` positionally, so the
# former positional call raises a TypeError there.
g = sns.barplot(x=ct_count_10.index, y=ct_count_10.values)
g.set_title('Top 10 City', fontsize = 15)
g.set_xlabel('City', fontsize = 15)
g.set_ylabel("Count", fontsize = 15)
plt.xticks(rotation=90)

# Write the percentage just above the centre of every bar.
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x() + p.get_width() / 2., height + 3,
          '{:1.2f}%'.format(height / freq * 100),
          ha = "center", fontsize = 18)

### Question
* 6.2 Does your city collaborate in partnership with businesses in your city on sustainability projects?

In [None]:
# Keep only the rows for question 6.2 and expose the organization column as "City".
cities_6_2 = df_ct[df_ct['Question Number'] == '6.2'].rename(columns={'Organization': 'City'})

# Fill missing answers from the filtered frame's own column. The earlier version
# filled the whole df_ct column and relied on index alignment to pick the
# matching rows back out.
cities_6_2['Response Answer'] = cities_6_2['Response Answer'].fillna('No Response')

cities_6_2.head()

### Added data
Import a country-code lookup table; its `CODE` column is used to place each country on the map.

In [None]:
# Country-code lookup table; the COUNTRY column is lower-cased to `country`
# so it shares a name with the key used in the merges further down.
df_import = pd.read_csv('../input/countries-iso-codes/country_codes.csv')
df_import = df_import.rename(columns={'COUNTRY': 'country'})

In [None]:
# Display the lookup table to inspect its columns and country spellings.
df_import

Checking country names

In [None]:
# Distinct country names in the lookup table, in order of first appearance.
countries = df_import['country'].drop_duplicates().tolist()
Number_of_countries = len(countries)

print(countries)
print("\nTotal countries df_import present: ", Number_of_countries)

In [None]:
# Distinct country names among the question 6.2 rows, in order of first appearance.
countries = cities_6_2['Country'].drop_duplicates().tolist()
Number_of_countries = len(countries)

print(countries)
print("\nTotal countries CDP_6.2 present: ", Number_of_countries)

The two tables are joined on the country name, so the names must be spelled identically in both; with an inner join, any country whose name does not match is dropped.

In [None]:
# CDP country spelling -> spelling to use in `re_country`.
# ('Chile' maps to itself, so that entry leaves the value unchanged.)
rename = {
    'United States of America': 'United States',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Bolivia (Plurinational State of)': 'Bolivia',
    'China, Hong Kong Special Administrative Region': 'Hong Kong',
    'Taiwan, Greater China': 'Taiwan',
    'Viet Nam': 'Vietnam',
    'Democratic Republic of the Congo': 'Congo, Democratic Republic of the',
    'Russian Federation': 'Russia',
    'Republic of Korea': 'Korea, South',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'United Republic of Tanzania': 'Tanzania',
    'Republic of Moldova': 'Moldova',
    'Chile': 'Chile',
}

# Translate the names listed above; any country not in the mapping keeps its
# original spelling.
original_country = cities_6_2['Country']
cities_6_2['re_country'] = original_country.map(rename).fillna(original_country)

In [None]:
# Count of non-null `City` values per (year, region, country) among the
# question 6.2 rows, flattened to ordinary columns and given shorter names.
countries_year = (
    cities_6_2
    .groupby(['Year Reported to CDP', 'CDP Region', 're_country'])['City']
    .count()
    .reset_index()
    .rename(columns={'Year Reported to CDP': 'Year', 're_country': 'country'})
)

Reference:

https://www.kaggle.com/callumr22/cdp-starter-notebook

Joining the CDP database with df_import.

In [None]:
# Attach the lookup-table columns by country name. With an inner join, rows
# whose name has no match in df_import are dropped.
countries_year = countries_year.merge(df_import, how='inner', on='country')

### Number of responses to question 6.2 (city partnerships with businesses on sustainability projects) per country and year

In [None]:
# Animated bubble map: one frame per year, bubble size = `City` count,
# colour = CDP region.
df = countries_year
fig = px.scatter_geo(
    df,
    locations="CODE",
    color="CDP Region",
    hover_name="country",
    size="City",
    animation_frame="Year",
    projection="natural earth",
)
fig.show()

In [None]:
# Count of non-null `City` values per (region, country) over all years,
# flattened to ordinary columns.
countries_total = (
    cities_6_2
    .groupby(['CDP Region', 're_country'])['City']
    .count()
    .reset_index()
    .rename(columns={'re_country': 'country'})
)

In [None]:
# Attach the lookup-table columns by country name. With an inner join, rows
# whose name has no match in df_import are dropped.
countries_total = countries_total.merge(df_import, how='inner', on='country')

### Total number of responses to question 6.2 (city partnerships with businesses on sustainability projects) per country

In [None]:
# Static bubble map over all years: bubble size = `City` count,
# colour = CDP region.
df = countries_total
fig = px.scatter_geo(
    df,
    locations="CODE",
    color="CDP Region",
    hover_name="country",
    size="City",
    projection="natural earth",
)
fig.show()

### Word Cloud for Question Name

In [None]:
# Discard rows without a question name so the text join below only sees values.
cities_6_2 = cities_6_2.dropna(subset=['Question Name'], axis=0)

In [None]:
# Text source for the first word cloud: the question-name column.
Response = cities_6_2['Question Name']

In [None]:
# Concatenate every entry into one space-separated string for the word cloud.
Response_summary = " ".join(Response)

In [None]:
# English stop-word list from NLTK, handed to WordCloud in the next cell.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be available
# in the runtime environment — confirm.
stopwords=nltk.corpus.stopwords.words('english')

In [None]:
# Configure a 1600x800 white-background cloud, then build it from the joined text.
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color='white',
    width=1600,
    height=800,
)
wordcloud = wordcloud.generate(Response_summary)

In [None]:
# Render the word cloud without axes and save it as a PNG.
fig, ax = plt.subplots(figsize=(16,8))

# Draw once, with bilinear smoothing. The former extra plt.imshow(wordcloud)
# painted a second, unsmoothed copy over this one on the same axes.
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()

wordcloud.to_file('rafael.png');

### Word Cloud for Response Answer

In [None]:
# Text source for the second word cloud: the response-answer column.
Response = cities_6_2['Response Answer']

In [None]:
# Concatenate every entry into one space-separated string for the word cloud.
Response_summary = " ".join(Response)

In [None]:
# English and Portuguese stop words for the response-answer cloud.
# Each language is requested separately: the second positional argument of
# words() is `ignore_lines_startswith`, not another language, so the former
# words('english', 'portuguese') loaded English only.
stopwords = (
    nltk.corpus.stopwords.words('english')
    + nltk.corpus.stopwords.words('portuguese')
)

In [None]:
# Configure a 1600x800 white-background cloud, then build it from the joined text.
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color='white',
    width=1600,
    height=800,
)
wordcloud = wordcloud.generate(Response_summary)

In [None]:
# Render the word cloud without axes and save it as a PNG.
fig, ax = plt.subplots(figsize=(16,8))

# Draw once, with bilinear smoothing. The former extra plt.imshow(wordcloud)
# painted a second, unsmoothed copy over this one on the same axes.
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_axis_off()

# NOTE(review): this is the same file name the "Question Name" cloud is saved
# under earlier in the notebook, so that image gets overwritten here — consider
# a distinct name.
wordcloud.to_file('rafael.png');

* Corporations Disclosing

Country

In [None]:
# Number of rows per country in the corporations frame.
df_cl.groupby('country').size()

In [None]:
# Bar chart of df_cl rows per country; each bar is labelled with its share of
# all rows.
plt.figure(figsize=(12, 5))

freq = len(df_cl)  # denominator for the percentage labels

sns.set_palette("pastel")

# Pass the column as `x=`: seaborn 0.12+ reserves the first positional argument
# for `data`, so a positional Series is no longer interpreted as the x variable.
g = sns.countplot(x=df_cl['country'])
g.set_xlabel('Country', fontsize = 15)
g.set_ylabel("Count", fontsize = 15)

# Write the percentage just above the centre of every bar.
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x() + p.get_width() / 2., height + 3,
          '{:1.2f}%'.format(height / freq * 100),
          ha = "center", fontsize = 18)

Year

In [None]:
# Number of rows per survey year in the corporations frame.
df_cl.groupby('survey_year').size()

In [None]:
# Bar chart of df_cl rows per survey year; each bar is labelled with its share
# of all rows.
plt.figure(figsize=(12, 5))

freq = len(df_cl)  # denominator for the percentage labels

sns.set_palette("pastel")

# Pass the column as `x=`: seaborn 0.12+ reserves the first positional argument
# for `data`, so a positional Series is no longer interpreted as the x variable.
g = sns.countplot(x=df_cl['survey_year'])
g.set_xlabel('Survey year', fontsize = 15)
g.set_ylabel("Count", fontsize = 15)

# Write the percentage just above the centre of every bar.
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x() + p.get_width() / 2., height + 3,
          '{:1.2f}%'.format(height / freq * 100),
          ha = "center", fontsize = 18)

### To be continued...