In [41]:
import plotly.plotly as py     ## For plotting
import pandas as pd            ## For the data analysis

### Here, our aim is to plot many graphs describing the distribution of Malaria in Africa.
### To do so, we will use both libraries "pandas" to analyze the dataset, and "plotly", which
### will help us plot great interactive graphs on the African continent.


df = pd.read_csv('https://raw.githubusercontent.com/SebastianS09/Malaria/master/Data/Malaria.csv')
df.head()

df['text'] = df['Country'] + '<br>Year ' + (df['YeStart']).astype(str)  #Text written on the interactive map
limits = [(1,len(df))]            # We take all the dataset as values considered
colors = ["rgb(155,187,23)"]      # Green color (see "rgb color picker" on Google)


#Now that we decided which points are we going to study, we now decide how to plot them, with
#the function 'dict'. We comment all its components.
Cases_Malaria = dict(                   
    type = 'scattergeo',
    locationmode = 'africa',      # We choose Africa, so that we have the African continent plotted
    lon = df['Long'],             
    lat = df['Lat'],              #lon and lat are the coordinates of the points
    text = df['text'],            #text written when we select a point
    marker = dict(
        color = colors,
        line = dict(width=0.5, color='rgb(155,187,23)'),
        sizemode = 'area'
    ),
    name = 'Presence of Malaria') #legend

#Now, we are going to decide how to lay out our graph, still with function "dict".
    
graph = dict( 
        title = 'Geographical Distribution of Malaria in Africa',     #Title of the graph
        showlegend = True,
        geo = dict(
            scope='africa',
            showland = True,
            landcolor = 'rgb(217, 217, 217)',                       #Color of the continent (light grey)
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"                      #Color of the frontiers (white)
        ),
    )

firstgraph = dict( data=[Cases_Malaria], layout=graph )
py.iplot( firstgraph, validate=False, filename='Presence of Malaria in Africa' )  #Plot with plotly

In [38]:
import numpy as np 
import pandas as pd 

data_raw = pd.read_csv("https://raw.githubusercontent.com/SebastianS09/Malaria/master/Data/Malaria.csv")

print(data_raw.head(10))

###Data cleaning 

####Removing unecessary geographical precision and study information
data_raw.columns = [c.replace(' ', '_') for c in data_raw.columns]
col_rm = ['GAUL_Admin2','Full_Name','LatLong_Source','Source_Title']
data_rm = data_raw.drop(col_rm, axis=1)

print(data_rm.head(10))

#### Replacing Y and NaN with 0 and 1 for ease of understanding (col 7 to 33)

data_clean = data_rm.copy()

aneo = list(data_clean)[6:32]
data_clean[aneo] = data_clean[aneo].replace(['Y'],1)
data_clean[aneo] = data_clean[aneo].fillna(0)

print(data_clean.head(10))

####Check if other species are relevant 

other_f = data_clean['Other_Anopheline_species'].str.split(', ', expand=True)
other_f.fillna(0,inplace=True)

a = other_f[0].value_counts().to_frame()
for i in list(other_f.drop(0,axis=1)):
    a = a.join(other_f[i].value_counts().to_frame())
a.fillna(0,inplace=True)

a.sum(axis=1).sort_values(ascending = False)
data_clean[aneo].sum(axis=0).sort_values(ascending = False)

  Country GAUL_Admin1    GAUL_Admin2       Full_Name     Lat     Long  \
0  Angola       Bengo          Dande  Barra do Dande -8.4730  13.3620   
1  Angola       Bengo          Dande       Boa Vista -8.5990  13.5660   
2  Angola       Bengo          Dande         Cabungo -8.5667  13.5000   
3  Angola       Bengo          Dande          Lifune -8.4000  13.4490   
4  Angola       Bengo         Dembos          Dembos -8.5156  14.5189   
5  Angola       Bengo  Icolo E Bengo       Bom Jesus -9.1660  13.5660   
6  Angola       Bengo  Icolo E Bengo   Icolo e Bengo -9.2500  13.7333   
7  Angola       Bengo  Icolo E Bengo         Kilunda -8.9268  13.5878   
8  Angola       Bengo  Icolo E Bengo     Lagoa Banda -8.8503  13.5842   
9  Angola       Bengo  Icolo E Bengo          Mazozo -9.1020  13.6220   

  LatLong_Source  YeStart  YeEnd An gambiae_complex  \
0        Encarta     1967   1967                  Y   
1        Encarta     1987   1987                  Y   
2          Other     2002   200

An_gambiae_complex                        11596.0
An_funestus__s.l                           5095.0
An_gambiae_ss                              3990.0
An_arabiensis                              3648.0
An_coustani_s.l                            2692.0
An_pharoensis                              1902.0
SS_S_Form_(savanah_or_Bamako_forms)        1575.0
SS_M_Form_(An_colluzzi_or_Mopti_forms)     1331.0
An_squamous                                1294.0
An_rufipes                                  999.0
An_nili_s.l                                 823.0
An_funestus_s.s._(specified)                727.0
An_ziemanni_                                663.0
An._melas                                   538.0
An_moucheti_s.l                             502.0
An_mascarensis                              483.0
An_marshalli                                444.0
An_paludis_                                 376.0
An_hancocki                                 298.0
An_wellcomei                                272.0


In [42]:
import plotly.plotly as py
import pandas as pd

#Now, our goal is to plot all cases of Malaria from 1898 to 2016 regarding the decades where
#the cases have been discovered. For this sake, we need to sort the dataset according to YeStart,
#then build a function that gives the indexes for our subsets (explanation below).



# get_lim gives the indexes for which we have the same value in a dataset.
# Example of use : let x a data frame with sorted column  x.lon = [0,0,3,3,3,3,4,4] 
#getlim(x,lon) will give : [(0,2),(2,6),(6,7)]

def get_lim(df,col):                                
    val=df[col].unique()
    temp = []
    temp_s = [0]
    out = []
    for i in range(0,len(val)):
        temp.append(len(df[df[col]==val[i]]))
    for i in range(0,len(val)):
        temp_s.append(sum(temp[0:i+1]))
    for i in range(0,len(val)):
        out.append((temp_s[i],temp_s[i+1]))
    return(out)    

year_dat = data_raw[["YeStart"]]                 #We get rid of all useless columns in this case
year_dat = year_dat.sort_values(["YeStart"])     #We sort the dataset
a=get_lim(year_dat,"YeStart")                    #We get all "limits", that we will use 3 lines below   

df = pd.read_csv('https://raw.githubusercontent.com/SebastianS09/Malaria/master/Data/Malaria.csv')
df=df.sort_values(['YeStart'])

df['text'] = df['Country'] + '<br>Year ' + (df['YeStart']).astype(str)

a10=a[12][1]     #a[x][y] is equal to : 
a11=a10+1        #                     - If y=1 : a[x][1] gives the last index for the value x in the dataset
a30=a[32][1]
a31=a30+1        #                     - If y=0 : a[x][0] gives the first index for the values x
a50=a[52][1]
a51=a50+1
a70=a[72][1]     # For example, for 1910, since it is our 12th year of the dataset (starts at 1898)
a71=a70+1        # we write a[12] to find the first and final index for the year 1910.
a90=a[92][1]
a91=a90+1
a216=len(df)

#Now, let's put "limits" for each two-decades. Each two-decades will be assigned a color in the plot.

limits = [(1,a10),(a11,a30),(a31,a50),(a51,a70),(a71,a90),(a91,a216)]
# limits =[(1898,1910),(1911,1930),(1931,1950),(1951,1970),(1971,1990),(1991,2016)]
colors = ["rgb(0,116,217)","rgb(255,65,54)","rgb(133,20,75)","rgb(255,133,27)","rgb(110,133,58)","rgb(45,187,23)","lightgrey"]
all_cases_ever = []
names=['1890-1910','1911-1930','1931-1950','1951-1970','1971-1990','1991-2016'] 


for i in range(len(limits)):
    lim = limits[i]
    df_sub = df[lim[0]:lim[1]]
    mal_case = dict(
        type = 'scattergeo',
        locationmode = 'africa',
        lon = df_sub['Long'],
        lat = df_sub['Lat'],
        text = df_sub['text'],
        marker = dict(
            color = colors[i],
            line = dict(width=0.5, color='rgb(40,40,40)'),
            sizemode = 'area'
        ),
        
        name=names[i] )

        
    all_cases_ever.append(mal_case)

    
    
    
    
graph2 = dict(
        title = 'Distribution de la Malaria au fil des années',
        showlegend = True,
        geo = dict(
            scope='africa',
            projection=dict( type='africa' ),
            showland = True,
            landcolor = 'rgb(185, 185, 185)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(200, 200, 200)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

secondgraph = dict( data=all_cases_ever, layout=graph2 )
py.iplot( secondgraph, validate=False, filename='Presence_malaria_accross_decades' )

In [36]:
import plotly.plotly as py
import pandas as pd



####### VOIR ÇA AVANT DE RENDRE : POSSIBILITÉ D'ENLEVER TOUS LES LAYOUT POUR EN LAISSER QU'UN SEUL
####### EVITE RECOPIAGE DE CODE

#Now, our aim is to plot all malarian cases according to the type of anophilie, so as to compare
#which one of the types is the more inclined to be contaminated by Malaria.

#Let's start with the type An_paludis

df_paludis= data_clean[data_clean.An_paludis_==1]

df_paludis['text'] = df_paludis['Country'] + '<br>Year ' + (df_paludis['YeStart']).astype(str)
limits = [(1,len(df_paludis))]
colors = ["rgb(155,187,23)"]


palud = dict(
    type = 'scattergeo',
    locationmode = 'africa',
    lon = df_paludis['Long'],
    lat = df_paludis['Lat'],
    text = df_paludis['text'],
    marker = dict(
        color = colors,
        line = dict(width=0.5, color='rgb(155,187,23)'),
        sizemode = 'area'
    ),
    name = 'Presence of An_paludis')


    
graphpalud = dict(
        title = 'An_paludis distribution',
        showlegend = True,
        geo = dict(
            scope='africa',
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

figpalud = dict( data=[palud], layout=graphpalud )
py.iplot( figpalud, validate=False, filename='Presence-of-An-paludis' )




########################
#An_gambiae_complex

df_gambiaecomp= data_clean[data_clean.An_gambiae_complex==1]

df_gambiaecomp['text'] = df_gambiaecomp['Country'] + '<br>Year ' + (df_gambiaecomp['YeStart']).astype(str)
limits = [(1,len(df_gambiaecomp))]
colors = ["rgb(155,187,23)"]


gambiae_comp = dict(
    type = 'scattergeo',
    locationmode = 'africa',
    lon = df_gambiaecomp['Long'],
    lat = df_gambiaecomp['Lat'],
    text = df_gambiaecomp['text'],
    marker = dict(
        color = colors,
        line = dict(width=0.5, color='rgb(155,187,23)'),
        sizemode = 'area'
    ),
    name = 'Presence of An_gambiae_complex')


    
graphgambiae = dict(
        title = 'Distribution of An_gambiae',
        showlegend = True,
        geo = dict(
            scope='africa',
            #projection=dict( type='africa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

figgambiae = dict( data=[gambiae_comp], layout=graphgambiae )
py.iplot( figgambiae, validate=False, filename='Presence-of-An-gambiae' )



#########################
#An_coudtani

df_coustani= data_clean[data_clean.An_coustani==1]

df_coustani['text'] = df_coustani['Country'] + '<br>Year ' + (df_coustani['YeStart']).astype(str)
limits = [(1,len(df_coustani))]
colors = ["rgb(155,187,23)"]


coustani = dict(
    type = 'scattergeo',
    locationmode = 'africa',
    lon = df_coustani['Long'],
    lat = df_coustani['Lat'],
    text = df_coustani['text'],
    marker = dict(
        color = colors,
        line = dict(width=0.5, color='rgb(155,187,23)'),
        sizemode = 'area'
    ),
    name = 'Presence of An_coustani')


    
graphcoustani = dict(
        title = 'Distribution of An_coustani',
        showlegend = True,
        geo = dict(
            scope='africa',
            #projection=dict( type='africa' ),
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

figcoustani = dict( data=[coustani], layout=graphcoustani )
py.iplot( figcoustani, validate=False, filename='Presence-of-An-coustani' )


##################
#An_squamosus

df_squamosus= data_clean[data_clean.An_squamosus==1]

df_squamosus['text'] = df_squamosus['Country'] + '<br>Year ' + (df_squamosus['YeStart']).astype(str)
limits = [(1,len(df_squamosus))]
colors = ["rgb(155,187,23)"]


squamosus = dict(
    type = 'scattergeo',
    locationmode = 'africa',
    lon = df_squamosus['Long'],
    lat = df_squamosus['Lat'],
    text = df_squamosus['text'],
    marker = dict(
        color = colors,
        line = dict(width=0.5, color='rgb(155,187,23)'),
        sizemode = 'area'
    ),
    name = 'Presence of An_squamosus')


    
graphsquamosus = dict(
        title = 'Distribution of An_squamosus',
        showlegend = True,
        geo = dict(
            scope='africa',
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

figsquamosus = dict( data=[squamosus], layout=graphsquamosus )
py.iplot( figsquamosus, validate=False, filename='Presence-of-An-squamosus' )

##################
#An_arabiensis

df_arabiensis= data_clean[data_clean.An_arabiensis==1]

df_arabiensis['text'] = df_arabiensis['Country'] + '<br>Year ' + (df_arabiensis['YeStart']).astype(str)
limits = [(1,len(df_arabiensis))]
colors = ["rgb(155,187,23)"]


arabiensis = dict(
    type = 'scattergeo',
    locationmode = 'africa',
    lon = df_arabiensis['Long'],
    lat = df_arabiensis['Lat'],
    text = df_arabiensis['text'],
    marker = dict(
        color = colors,
        line = dict(width=0.5, color='rgb(155,187,23)'),
        sizemode = 'area'
    ),
    name = 'Presence of An_arabiensis')


    
grapharabiensis = dict(
        title = 'Distribution of An_arabiensis',
        showlegend = True,
        geo = dict(
            scope='africa',
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

figarabiensis = dict( data=[arabiensis], layout=grapharabiensis )
py.iplot( figarabiensis, validate=False, filename='Presence-of-An-arabiensis')



#####################
#An_funestus


df_funestus= data_clean[data_clean.An_funestus==1]

df_funestus['text'] = df_funestus['Country'] + '<br>Year ' + (df_funestus['YeStart']).astype(str)
limits = [(1,len(df_funestus))]
colors = ["rgb(155,187,23)"]


squamosus = dict(
    type = 'scattergeo',
    locationmode = 'africa',
    lon = df_funestus['Long'],
    lat = df_funestus['Lat'],
    text = df_funestus['text'],
    marker = dict(
        color = colors,
        line = dict(width=0.5, color='rgb(155,187,23)'),
        sizemode = 'area'
    ),
    name = 'Presence of An_funestus')


    
graphfunestus = dict(
        title = 'Distribution of An_funestus',
        showlegend = True,
        geo = dict(
            scope='africa',
            showland = True,
            landcolor = 'rgb(217, 217, 217)',
            subunitwidth=1,
            countrywidth=1,
            subunitcolor="rgb(255, 255, 255)",
            countrycolor="rgb(255, 255, 255)"
        ),
    )

figfunestus = dict( data=[funestus], layout=graphfunestus )
py.iplot( figfunestus, validate=False, filename='Presence-of-An-funestus' )




fig = tools.make_subplots(rows=3, cols=2, subplot_titles=('Plot 1', 'Plot 2',
                                                          'Plot 3', 'Plot 4',
                                                          'Plot 5', 'Plot 6'))

fig.append_trace(figpalud, 1, 1)
fig.append_trace(figgambiae, 1, 2)
fig.append_trace(figcoustani, 2, 1)
fig.append_trace(figsquamosus, 2, 2)
fig.append_trace(figarabiensis, 3, 1)
fig.append_trace(figfunestus, 3, 2)


fig['layout'].update(height=600, width=600, title='Multiple Subplots' +
                                                  ' with Titles')

py.iplot(fig, filename='make-subplots-multiple-with-titles')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



AttributeError: 'DataFrame' object has no attribute 'An_coustani'

In [43]:
list(df)

['Country',
 'GAUL_Admin1',
 'GAUL_Admin2',
 'Full_Name',
 'Lat',
 'Long',
 'LatLong_Source',
 'YeStart',
 'YeEnd',
 'An gambiae_complex',
 'An gambiae ss',
 'SS M Form (An colluzzi or Mopti forms)',
 'SS S Form (savanah or Bamako forms)',
 'An arabiensis',
 'An. melas',
 'An. merus',
 'An bwambae',
 'An funestus  s.l',
 'An funestus s.s. (specified)',
 'An rivulorum',
 'An leesoni',
 'An parensis',
 'An vaneedeni',
 'An nili s.l',
 'An moucheti s.l',
 'An pharoensis',
 'An hancocki',
 'An mascarensis',
 'An marshalli',
 'An squamous',
 'An wellcomei',
 'An rufipes',
 'An coustani s.l',
 'An ziemanni ',
 'An paludis ',
 'Adults/Larvae',
 'Sampling_Methods',
 'Species_Identification',
 'Other sib species names',
 'Other Anopheline species',
 'Source_Title',
 'text']