In [1]:
import pandas as pd
from sklearn import datasets, linear_model
import numpy as np
from matplotlib import pyplot as plt

from operator import itemgetter
from math import sqrt

In [2]:
seadata = pd.read_csv('sea-level-data.csv', encoding = 'ISO-8859-1').ix[1:, :]

In [3]:
seadata

Unnamed: 0,Year,1992,1992.1,1992.2,1992.3,1992.4,1992.5,1992.6,1992.7,1992.8,...,2016.2,2016.3,2016.4,2016.5,2016.6,2016.7,2016.8,2016.9,2016.10,2016.11
1,1,-247,-351,-300,-142,-45,-226,-340,-316,-97,...,-212,-130,-90,-155,-237,-161,-23,111,136,0
2,2,-438,-99999,-399,-195,-143,-255,-429,-441,-225,...,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,0
3,3,-455,-556,-458,-290,-165,-293,-458,-464,-254,...,-317,-184,-24,-122,-264,-210,-82,30,XXXXX,0
4,4,-447,-548,-487,-289,-197,-362,-523,-508,-267,...,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,0
5,5,-483,-587,-495,-318,-209,-364,-507,-483,-251,...,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,0
6,6,-250,-360,-284,-113,-15,-186,-339,-326,-107,...,-265,-121,-1,-86,-183,-114,21,95,109,0
7,7,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,...,-239,-86,14,-48,-182,-145,-22,37,100,0
8,8,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,...,-231,-98,14,-92,-205,-142,7,100,123,0
9,9,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,XXXXX,...,-217,-91,12,-81,-211,-126,26,144,145,0
10,10,-330,-381,-302,-183,-74,-199,-281,-275,-135,...,-234,-124,-58,-97,-137,-70,-13,52,27,0


In [4]:
seadata.iloc[0, 1]

'-247'

# this part is used to find similarities

In [5]:
#the approach to fill the missing data is:
#1 using Pearson product-moment correlation coefficient to find strongly linear relationship blcok
#find the linear regression equation between this two block
#use this equation to fill out missing data

In [6]:
def find_mean(row):
    row_sum = 0
    row_valid = 0
    for data in row:
        if (data == "-99999" or data == "XXXXX"):
            pass
        else:
            row_sum += int(data)
            row_valid += 1
    return row_sum / row_valid

In [7]:
def find_sd(row, mean):
    var_sum = 0
    for data in row:
        if (data == "-99999" or data == "XXXXX"):
            pass
        else:
            var_sum += (int(data) - mean)**2
    
    return sqrt(var_sum)

In [8]:
def similarities(block1, block2):
    #Pearson correlation coefficient
    row1 = seadata.iloc[block1 - 1, 1:]
    row2 = seadata.iloc[block2 - 1, 1:]
    
    row1_mean = find_mean(row1)
    row1_sd = find_sd(row1, row1_mean)
    row2_mean = find_mean(row2)
    row2_sd = find_sd(row2, row2_mean)
    
    uppersum = 0
    
    for data1, data2 in zip(row1, row2):
        if (data1 == "-99999" or data1 == "XXXXX" or data2 == "-99999" or data2 == "XXXXX"):
            pass
        else:
            uppersum += (int(data1) - row1_mean) * (int(data2) - row2_mean)
            
    return uppersum / (row1_sd * row2_sd)
            

In [9]:
def linear_regression(block1, block2):
    
    row1 = seadata.iloc[block1 - 1, 1:]
    row2 = seadata.iloc[block2 - 1, 1:]
    
    #row1 will be on the Y-axis and row2 will be on the X-axis
    #first input block as Y axis, second candidate block  as X axis
    x_axis = []
    y_axis = []
    
    #extract valid data
    for data1, data2 in zip(row1, row2):
        if (data1 == "-99999" or data1 == "XXXXX" or data2 == "-99999" or data2 == "XXXXX"):
            pass
        else:
            #first input block as Y axis, second candidate block  as X axis
            y_axis.append(int(data1))
            x_axis.append(int(data2))
    
    #find regression
    line = np.polyfit(x = x_axis, y = y_axis, deg = 1)
    
    return list(line)

In [10]:
def find_similar_block():
    #result = (block, [(block, scire)similar list])
    result = []
    
    for block in range(1,21):
        score_list = [] #append(blockB, similarities)

        for waiting_block in range(1, 21):
            score_list.append((similarities(block, waiting_block), 
                                waiting_block, 
                               linear_regression(block, waiting_block)))
            
        score_list.sort(reverse = True)
            
        result.append((block, score_list))
    
    return result    

In [11]:
result = find_similar_block()
result



[(1,
  [(1.0000000000000002, 1, [0.99999999999999978, 2.6254850544756922e-14]),
   (0.90142673779444, 3, [0.75190494977259004, 52.67809970832986]),
   (0.8930462391045597, 6, [0.77641802188284181, 0.30062167702055964]),
   (0.8286410687005241, 5, [0.75552726282544325, 137.18152900760921]),
   (0.7876223371642532, 10, [0.73321412708513889, -11.690810789699874]),
   (0.7831187832761317, 9, [0.84175180300513042, -27.900670544223136]),
   (0.7596569808385306, 15, [0.75183379902180503, -38.936523847497305]),
   (0.7374027552549586, 4, [0.79746790132422718, 165.49845156479543]),
   (0.710306450666294, 7, [0.71291120672747876, -11.943882752046552]),
   (0.6958421062149394, 14, [0.7497473908287513, -45.187016004925525]),
   (0.6660258670170521, 19, [0.72155446604387774, -81.019203721575408]),
   (0.5978887528869663, 18, [0.70324450397780036, -60.164852849723452]),
   (0.5868197248593428, 20, [0.61900324991795463, -98.344588353770874]),
   (0.5191297887862191, 8, [0.78728200467624099, -28.14722

In [46]:
def show_plot(block1, block2):
    #this function is to show the linear plot and scatter plot
    row1 = seadata.iloc[block1 - 1, 1:]
    row2 = seadata.iloc[block2 - 1, 1:]
    
    #row1 will be on the Y-axis and row2 will be on the X-axis
    #first input block as Y axis, second candidate block  as X axis
    x_axis = []
    y_axis = []
    
    #extract valid data
    for data1, data2 in zip(row1, row2):
        if (data1 == "-99999" or data1 == "XXXXX" or data2 == "-99999" or data2 == "XXXXX"):
            pass
        else:
            #first input block as Y axis, second candidate block  as X axis
            y_axis.append(int(data1)) #this is data from block1
            x_axis.append(int(data2)) #this is data from block2
    
    #find the linear regression
    line = np.polyfit(x = x_axis, y = y_axis, deg = 1)
    new_y = [line[0] * x + line[1] for x in x_axis]
    
    #drawing graph
    plt.figure()
    plt.scatter(x_axis, y_axis)
    plt.plot(x_axis, new_y)
    
    #adding title and save the file
    string = "Y-axis"+str(block1)+"  x-axis"+str(block2)+"  R ="+str(waiting_block[0])
    plt.title(string)
    Name = "Y-axis"+str(block1)+"  x-axis"+str(block2)
    plt.savefig(Name)
    plt.close()
    #plt.show()


In [47]:
def show_scatter_lineat(result):
    #this function is created to find the relationship between R, linear and scatter
    for data in result:
        main_block = data[0]
        for waiting_block in data[1][:5]:
            candidate_block = waiting_block[1]
            if main_block != candidate_block:
                
                show_plot(main_block, candidate_block)
                

In [48]:
show_scatter_lineat(result)



# The following part is to replace value of two group given the block group

In [4]:
def replace_mean_plan_B(block_list):
    #when there is no value of two block, we use the mean of the whole island
    block_index = [(block - 1) for block in block_list]
    
    for column in range(1,300):
        need_to_replace = []
        total_num_block = 0
        total_sea_level = 0
        mean_sea_level = 0
        for block in block_index:
            data = seadata.iloc[block, column]
            if data == "-99999" or data == "XXXXX":
                pass
            else:
                total_sea_level += int(data)
                total_num_block += 1
    
    mean_sea_level = total_sea_level / total_num_block
    return mean_sea_level

In [7]:
def replace_mean(block_list):
    block_index = [(block - 1) for block in block_list]
    
    for column in range(1,300):
        need_to_replace = []
        total_num_block = 0
        total_sea_level = 0
        mean_sea_level = 0
        for block in block_index:
            data = seadata.iloc[block, column]
            if data == "-99999" or data == "XXXXX":
                need_to_replace.append((block, column))
            else:
                total_sea_level += int(data)
                total_num_block += 1
                
        #in the case which there is no data on all columnm, we use the mean value of the whole island        
        if total_num_block == 0:
            total_sea_level = replace_mean_plan_B([x for x in range(1,21)])
            total_num_block = 1
            
        #calculte the mean sea-level based on the adjacent block
        mean_sea_level = total_sea_level / total_num_block
        
        #write data into the dataframe
        for (block, column) in need_to_replace:
            seadata.iloc[block, column] = mean_sea_level
            

# fill out the data

In [11]:
#first level that can get from original data
replace_mean([1, 2, 3])
replace_mean([4, 5, 6])
replace_mean([8, 9, 10])
replace_mean([16, 15])

#second level that based on first level's data
replace_mean([13,14,15])
replace_mean([19, 18, 15])
replace_mean([20,19, 18])
replace_mean([6,7,8])

#third level that based on second level's data
replace_mean([10, 13, 14, 15, 11, 12])
replace_mean([16,17,20])

In [12]:
seadata

Unnamed: 0,Year,1992,1992.1,1992.2,1992.3,1992.4,1992.5,1992.6,1992.7,1992.8,...,2016.2,2016.3,2016.4,2016.5,2016.6,2016.7,2016.8,2016.9,2016.10,2016.11
1,1,-247.0,-351.0,-300.0,-142.0,-45.0,-226.0,-340.0,-316.0,-97.0,...,-212.0,-130.0,-90.0,-155.0,-237.0,-161.0,-23.0,111.0,136.0,0
2,2,-438.0,-453.5,-399.0,-195.0,-143.0,-255.0,-429.0,-441.0,-225.0,...,-264.5,-157.0,-57.0,-138.5,-250.5,-185.5,-52.5,70.5,136.0,0
3,3,-455.0,-556.0,-458.0,-290.0,-165.0,-293.0,-458.0,-464.0,-254.0,...,-317.0,-184.0,-24.0,-122.0,-264.0,-210.0,-82.0,30.0,136.0,0
4,4,-447.0,-548.0,-487.0,-289.0,-197.0,-362.0,-523.0,-508.0,-267.0,...,-265.0,-121.0,-1.0,-86.0,-183.0,-114.0,21.0,95.0,109.0,0
5,5,-483.0,-587.0,-495.0,-318.0,-209.0,-364.0,-507.0,-483.0,-251.0,...,-265.0,-121.0,-1.0,-86.0,-183.0,-114.0,21.0,95.0,109.0,0
6,6,-250.0,-360.0,-284.0,-113.0,-15.0,-186.0,-339.0,-326.0,-107.0,...,-265.0,-121.0,-1.0,-86.0,-183.0,-114.0,21.0,95.0,109.0,0
7,7,-290.0,-370.5,-293.0,-148.0,-44.5,-192.5,-310.0,-300.5,-121.0,...,-239.0,-86.0,14.0,-48.0,-182.0,-145.0,-22.0,37.0,100.0,0
8,8,-330.0,-381.0,-302.0,-183.0,-74.0,-199.0,-281.0,-275.0,-135.0,...,-231.0,-98.0,14.0,-92.0,-205.0,-142.0,7.0,100.0,123.0,0
9,9,-330.0,-381.0,-302.0,-183.0,-74.0,-199.0,-281.0,-275.0,-135.0,...,-217.0,-91.0,12.0,-81.0,-211.0,-126.0,26.0,144.0,145.0,0
10,10,-330.0,-381.0,-302.0,-183.0,-74.0,-199.0,-281.0,-275.0,-135.0,...,-234.0,-124.0,-58.0,-97.0,-137.0,-70.0,-13.0,52.0,27.0,0


In [52]:
seadata.to_csv('Complete data set.csv')