In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt


#This script is implemented according to the CRISP-DM process
###################################################Description###############################################################

#In this project we want to analyze data from series of tests in botany 
#without giving more precise information about which tests were carried out. 
#The question arises as to whether the experiments are statistically independent of one another.

##################################################DATA UNDERSTANDING#########################################################

#Starting by loading the data
df = pd.read_excel("TestData_2018_TS.xls")
#In addition we skip Unnamed Columns
df = df.drop(['Unnamed: 50','Unnamed: 51'], axis=1)
df.drop([55], inplace=True)
df.head(57)

Unnamed: 0,Testdata,2018,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49
0,Datum,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,40.0,41.0,42.0,43.0,44.0,45.0,46.0,47.0,48.0,49.0
1,2018-12-28 00:00:00,,,,,,,,,,...,1.0,,,,,,,,,
2,2018-12-21 00:00:00,,,1.0,,,,,,,...,,,,,,,,,,1.0
3,2018-12-14 00:00:00,,,,1.0,,,,,,...,,,,,,,,,,
4,2018-12-07 00:00:00,,,,1.0,,,,,,...,,,1.0,,,,,,,
5,2018-11-30 00:00:00,,,,,,,,,,...,,1.0,,,,,,,,
6,2018-11-23 00:00:00,,,,,,,,,,...,,,,,,,1.0,,,
7,2018-11-16 00:00:00,,,,,,,,,,...,,,,,,1.0,,,,
8,2018-11-09 00:00:00,,,,,,,,1.0,,...,,,,,,,1.0,,,1.0
9,2018-11-02 00:00:00,,,,,1.0,,,,,...,,,1.0,,,,,,,


In [2]:
#We´re analysing the shape
column_nr = df.shape[1]
#Here we want to rename the rows and columns
str_year = "Data"
str_in = "Unnamed: "
str_out = "Reihe: "
#Renaming the columns
column_rename_row = [str_out+str(i) for i in range(column_nr-1)]
#Renaming the the first column
column_rename_row.insert(0,str_year)# changing columns using .columns() 
df.columns = column_rename_row
df.head()

Unnamed: 0,Data,Reihe: 0,Reihe: 1,Reihe: 2,Reihe: 3,Reihe: 4,Reihe: 5,Reihe: 6,Reihe: 7,Reihe: 8,...,Reihe: 39,Reihe: 40,Reihe: 41,Reihe: 42,Reihe: 43,Reihe: 44,Reihe: 45,Reihe: 46,Reihe: 47,Reihe: 48
0,Datum,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,40.0,41.0,42.0,43.0,44.0,45.0,46.0,47.0,48.0,49.0
1,2018-12-28 00:00:00,,,,,,,,,,...,1.0,,,,,,,,,
2,2018-12-21 00:00:00,,,1.0,,,,,,,...,,,,,,,,,,1.0
3,2018-12-14 00:00:00,,,,1.0,,,,,,...,,,,,,,,,,
4,2018-12-07 00:00:00,,,,1.0,,,,,,...,,,1.0,,,,,,,


In [3]:
df.drop(df.index[:1], inplace=True)
df.head()

Unnamed: 0,Data,Reihe: 0,Reihe: 1,Reihe: 2,Reihe: 3,Reihe: 4,Reihe: 5,Reihe: 6,Reihe: 7,Reihe: 8,...,Reihe: 39,Reihe: 40,Reihe: 41,Reihe: 42,Reihe: 43,Reihe: 44,Reihe: 45,Reihe: 46,Reihe: 47,Reihe: 48
1,2018-12-28 00:00:00,,,,,,,,,,...,1.0,,,,,,,,,
2,2018-12-21 00:00:00,,,1.0,,,,,,,...,,,,,,,,,,1.0
3,2018-12-14 00:00:00,,,,1.0,,,,,,...,,,,,,,,,,
4,2018-12-07 00:00:00,,,,1.0,,,,,,...,,,1.0,,,,,,,
5,2018-11-30 00:00:00,,,,,,,,,,...,,1.0,,,,,,,,


In [4]:
df = df.fillna(0, axis=0)
df.head()

Unnamed: 0,Data,Reihe: 0,Reihe: 1,Reihe: 2,Reihe: 3,Reihe: 4,Reihe: 5,Reihe: 6,Reihe: 7,Reihe: 8,...,Reihe: 39,Reihe: 40,Reihe: 41,Reihe: 42,Reihe: 43,Reihe: 44,Reihe: 45,Reihe: 46,Reihe: 47,Reihe: 48
1,2018-12-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-12-21,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2018-12-14,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-12-07,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2018-11-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
# Draw Plot
def plot_df(df, x, y, title="", xlabel='Date', ylabel='Value', dpi=100):
    plt.figure(figsize=(16,5), dpi=dpi)
    plt.plot(x, y, color='tab:red')
    plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()

plot_df(df, x=df.index, y=df.value, title='')    

AttributeError: 'DataFrame' object has no attribute 'value'

In [15]:
columns = list(df)
columns.pop(0)

for col in columns:
    #Converting the datatype
    df[col] = df[col].astype('float64')
    
df.head(10)

Unnamed: 0,Data,Reihe: 0,Reihe: 1,Reihe: 2,Reihe: 3,Reihe: 4,Reihe: 5,Reihe: 6,Reihe: 7,Reihe: 8,...,Reihe: 39,Reihe: 40,Reihe: 41,Reihe: 42,Reihe: 43,Reihe: 44,Reihe: 45,Reihe: 46,Reihe: 47,Reihe: 48
1,2018-12-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-12-21,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2018-12-14,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-12-07,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2018-11-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2018-11-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,2018-11-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,2018-11-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
9,2018-11-02,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,2018-10-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [19]:
df_new_data = df.copy()
np_array_temp = df.to_numpy()

for pos_i in range(np_array_temp.shape[0]):
    #for pos_j in range(np_array_temp.shape[1]):
        #df[str_out+str(pos_i)] = pos_i * df[str_out+str(pos_i)]
    print(df.at[pos_i+1] df.at[4, 'B'])
        #print(df[str_out+str(pos_j)])
        #print(pos_j * df[str_out+str(pos_i)])
        
#df["col"] = 2 * df["col"]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54


In [6]:
np_array_temp = df.to_numpy()

for pos_i in range(np_array_temp.shape[0]):
    for pos_j in range(np_array_temp.shape[1]):
        #df[str_out+str(pos_i)] = pos_i * df[str_out+str(pos_i)]
        print (df[str(pos_i+1), str_out+str(pos_j)])
        #print (pos_j * df[str_out+str(pos_i)]) 

KeyError: ('1', 'Reihe: 0')

In [129]:
print(df["Reihe: 0","1"])

KeyError: ('Reihe: 0', '1')

In [74]:
np_array_temp.shape[0]

55

In [75]:
np_array_temp.shape[1]

50

In [92]:
df_new_data.head()

Unnamed: 0,Testdata,2018,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49
0,Datum,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,40.0,41.0,42.0,43.0,44.0,45.0,46.0,47.0,48.0,49.0
1,2018-12-28 00:00:00,,,,,,,,,,...,1.0,,,,,,,,,
2,2018-12-21 00:00:00,,,1.0,,,,,,,...,,,,,,,,,,1.0
3,2018-12-14 00:00:00,,,,1.0,,,,,,...,,,,,,,,,,
4,2018-12-07 00:00:00,,,,1.0,,,,,,...,,,1.0,,,,,,,


In [128]:
columns = list(df)
columns

['Data',
 'Reihe: 0',
 'Reihe: 1',
 'Reihe: 2',
 'Reihe: 3',
 'Reihe: 4',
 'Reihe: 5',
 'Reihe: 6',
 'Reihe: 7',
 'Reihe: 8',
 'Reihe: 9',
 'Reihe: 10',
 'Reihe: 11',
 'Reihe: 12',
 'Reihe: 13',
 'Reihe: 14',
 'Reihe: 15',
 'Reihe: 16',
 'Reihe: 17',
 'Reihe: 18',
 'Reihe: 19',
 'Reihe: 20',
 'Reihe: 21',
 'Reihe: 22',
 'Reihe: 23',
 'Reihe: 24',
 'Reihe: 25',
 'Reihe: 26',
 'Reihe: 27',
 'Reihe: 28',
 'Reihe: 29',
 'Reihe: 30',
 'Reihe: 31',
 'Reihe: 32',
 'Reihe: 33',
 'Reihe: 34',
 'Reihe: 35',
 'Reihe: 36',
 'Reihe: 37',
 'Reihe: 38',
 'Reihe: 39',
 'Reihe: 40',
 'Reihe: 41',
 'Reihe: 42',
 'Reihe: 43',
 'Reihe: 44',
 'Reihe: 45',
 'Reihe: 46',
 'Reihe: 47',
 'Reihe: 48']