In [1]:
import pandas as pd

### Load the extension

In [2]:
%load_ext nl2pandas.magic.magic

### Import the dataset

In [3]:
# %nl2pandas import 'Kumpula-June-2016-w-metadata.txt'
# Parse the file as CSV, skipping its first 8 lines
# (presumably the metadata header the file name refers to -- confirm against the file).
data = pd.read_csv('Kumpula-June-2016-w-metadata.txt', skiprows=8)

In [4]:
# %nl2pandas show the first rows
# head() returns five rows by default; the count is spelled out here.
data.head(n=5)

Unnamed: 0,YEARMODA,TEMP,MAX,MIN
0,20160601,65.5,73.6,54.7
1,20160602,65.8,80.8,55.0
2,20160603,68.4,,55.6
3,20160604,57.5,70.9,47.3
4,20160605,51.4,58.3,43.2


### Basic calculations
This section of the tutorial uses indexing methods, which are not callable and are therefore outside the scope of this thesis.

In [5]:
# Add a float column "DIFF" filled with zeros as a placeholder
data.loc[:, "DIFF"] = 0.0

# Show the dataframe as plain text
print(data)

    YEARMODA  TEMP   MAX   MIN  DIFF
0   20160601  65.5  73.6  54.7   0.0
1   20160602  65.8  80.8  55.0   0.0
2   20160603  68.4   NaN  55.6   0.0
3   20160604  57.5  70.9  47.3   0.0
4   20160605  51.4  58.3  43.2   0.0
5   20160606  52.2  59.7  42.8   0.0
6   20160607  56.9  65.1  45.9   0.0
7   20160608  54.2   NaN  47.5   0.0
8   20160609  49.4  54.1  45.7   0.0
9   20160610  49.5  55.9  43.0   0.0
10  20160611  54.0  62.1  41.7   0.0
11  20160612  55.4  64.2  46.0   0.0
12  20160613  58.3  68.2  47.3   0.0
13  20160614  59.7  67.8  47.8   0.0
14  20160615  63.4  70.3  49.3   0.0
15  20160616  57.8  67.5  55.6   0.0
16  20160617  60.4  70.7  55.9   0.0
17  20160618  57.3   NaN  54.0   0.0
18  20160619  56.3  59.2  54.1   0.0
19  20160620  59.3  69.1  52.2   0.0
20  20160621  62.6  71.4  50.4   0.0
21  20160622  61.7  70.2  55.4   0.0
22  20160623  60.9  67.1  54.9   0.0
23  20160624  61.1  68.9  56.7   0.0
24  20160625  65.7  75.4  57.9   0.0
25  20160626  69.6  77.7  60.3   0.0
2

In [6]:
# Confirm the data type of the new column
data["DIFF"].dtype

dtype('float64')

In [7]:
# Daily temperature range: MAX minus MIN (a missing operand yields NaN)
data["DIFF"] = data["MAX"].sub(data["MIN"])

# Inspect the first rows of the result
print(data.head())

   YEARMODA  TEMP   MAX   MIN  DIFF
0  20160601  65.5  73.6  54.7  18.9
1  20160602  65.8  80.8  55.0  25.8
2  20160603  68.4   NaN  55.6   NaN
3  20160604  57.5  70.9  47.3  23.6
4  20160605  51.4  58.3  43.2  15.1


In [8]:
# Difference between the TEMP and MIN column values
data["DIFF_MIN"] = data["TEMP"].sub(data["MIN"])

# Show the dataframe as plain text
print(data)

    YEARMODA  TEMP   MAX   MIN  DIFF  DIFF_MIN
0   20160601  65.5  73.6  54.7  18.9      10.8
1   20160602  65.8  80.8  55.0  25.8      10.8
2   20160603  68.4   NaN  55.6   NaN      12.8
3   20160604  57.5  70.9  47.3  23.6      10.2
4   20160605  51.4  58.3  43.2  15.1       8.2
5   20160606  52.2  59.7  42.8  16.9       9.4
6   20160607  56.9  65.1  45.9  19.2      11.0
7   20160608  54.2   NaN  47.5   NaN       6.7
8   20160609  49.4  54.1  45.7   8.4       3.7
9   20160610  49.5  55.9  43.0  12.9       6.5
10  20160611  54.0  62.1  41.7  20.4      12.3
11  20160612  55.4  64.2  46.0  18.2       9.4
12  20160613  58.3  68.2  47.3  20.9      11.0
13  20160614  59.7  67.8  47.8  20.0      11.9
14  20160615  63.4  70.3  49.3  21.0      14.1
15  20160616  57.8  67.5  55.6  11.9       2.2
16  20160617  60.4  70.7  55.9  14.8       4.5
17  20160618  57.3   NaN  54.0   NaN       3.3
18  20160619  56.3  59.2  54.1   5.1       2.2
19  20160620  59.3  69.1  52.2  16.9       7.1
20  20160621 

In [9]:
# New column: TEMP converted from Fahrenheit to Celsius, i.e. (F - 32) / (9/5)
data["TEMP_CELSIUS"] = data["TEMP"].sub(32).div(9 / 5)

# %nl2pandas show first rows
data.head()

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS
0,20160601,65.5,73.6,54.7,18.9,10.8,18.611111
1,20160602,65.8,80.8,55.0,25.8,10.8,18.777778
2,20160603,68.4,,55.6,,12.8,20.222222
3,20160604,57.5,70.9,47.3,23.6,10.2,14.166667
4,20160605,51.4,58.3,43.2,15.1,8.2,10.777778


### Selecting rows and columns
This section of the tutorial uses indexing methods such as `loc` and `iloc`, which are not callable and are therefore outside the scope of this thesis.

### Filtering and updating data
This section of the tutorial uses indexing methods, which are not callable and are therefore outside the scope of this thesis.

In [10]:
# Keep the rows that are warmer than 15 degrees Celsius and dated 15 June 2016 or later.
# YEARMODA stores dates as YYYYMMDD numbers, so a numeric comparison orders them by date.
warm_temps = data.loc[
    data["TEMP_CELSIUS"].gt(15) & data["YEARMODA"].ge(20160615)
]
print(warm_temps)

    YEARMODA  TEMP   MAX   MIN  DIFF  DIFF_MIN  TEMP_CELSIUS
14  20160615  63.4  70.3  49.3  21.0      14.1     17.444444
16  20160617  60.4  70.7  55.9  14.8       4.5     15.777778
19  20160620  59.3  69.1  52.2  16.9       7.1     15.166667
20  20160621  62.6  71.4  50.4  21.0      12.2     17.000000
21  20160622  61.7  70.2  55.4  14.8       6.3     16.500000
22  20160623  60.9  67.1  54.9  12.2       6.0     16.055556
23  20160624  61.1  68.9  56.7  12.2       4.4     16.166667
24  20160625  65.7  75.4  57.9  17.5       7.8     18.722222
25  20160626  69.6  77.7  60.3  17.4       9.3     20.888889
26  20160627  60.7  70.0   NaN   NaN       NaN     15.944444
27  20160628  65.4  73.0  55.8  17.2       9.6     18.555556
28  20160629  65.8  73.2   NaN   NaN       NaN     18.777778
29  20160630  65.7  72.7  59.2  13.5       6.5     18.722222


In [11]:
# %nl2pandas reset the index
# drop=True discards the old row labels instead of keeping them as a new column.
warm_temps = warm_temps.reset_index(drop=True)

In [12]:
# Print the dataframe to check the renumbered index
print(warm_temps)

    YEARMODA  TEMP   MAX   MIN  DIFF  DIFF_MIN  TEMP_CELSIUS
0   20160615  63.4  70.3  49.3  21.0      14.1     17.444444
1   20160617  60.4  70.7  55.9  14.8       4.5     15.777778
2   20160620  59.3  69.1  52.2  16.9       7.1     15.166667
3   20160621  62.6  71.4  50.4  21.0      12.2     17.000000
4   20160622  61.7  70.2  55.4  14.8       6.3     16.500000
5   20160623  60.9  67.1  54.9  12.2       6.0     16.055556
6   20160624  61.1  68.9  56.7  12.2       4.4     16.166667
7   20160625  65.7  75.4  57.9  17.5       7.8     18.722222
8   20160626  69.6  77.7  60.3  17.4       9.3     20.888889
9   20160627  60.7  70.0   NaN   NaN       NaN     15.944444
10  20160628  65.4  73.0  55.8  17.2       9.6     18.555556
11  20160629  65.8  73.2   NaN   NaN       NaN     18.777778
12  20160630  65.7  72.7  59.2  13.5       6.5     18.722222


### Dealing with missing data

In [13]:
# %nl2pandas drop missing data from 'MIN'
# The filtered frame is only displayed; warm_temps itself is not modified.
warm_temps.dropna(subset=['MIN'])

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS
0,20160615,63.4,70.3,49.3,21.0,14.1,17.444444
1,20160617,60.4,70.7,55.9,14.8,4.5,15.777778
2,20160620,59.3,69.1,52.2,16.9,7.1,15.166667
3,20160621,62.6,71.4,50.4,21.0,12.2,17.0
4,20160622,61.7,70.2,55.4,14.8,6.3,16.5
5,20160623,60.9,67.1,54.9,12.2,6.0,16.055556
6,20160624,61.1,68.9,56.7,12.2,4.4,16.166667
7,20160625,65.7,75.4,57.9,17.5,7.8,18.722222
8,20160626,69.6,77.7,60.3,17.4,9.3,20.888889
10,20160628,65.4,73.0,55.8,17.2,9.6,18.555556


In [14]:
# %nl2pandas fill the missing values with -9999
# The filled frame is only displayed; warm_temps itself is not modified.
warm_temps.fillna(-9999)

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS
0,20160615,63.4,70.3,49.3,21.0,14.1,17.444444
1,20160617,60.4,70.7,55.9,14.8,4.5,15.777778
2,20160620,59.3,69.1,52.2,16.9,7.1,15.166667
3,20160621,62.6,71.4,50.4,21.0,12.2,17.0
4,20160622,61.7,70.2,55.4,14.8,6.3,16.5
5,20160623,60.9,67.1,54.9,12.2,6.0,16.055556
6,20160624,61.1,68.9,56.7,12.2,4.4,16.166667
7,20160625,65.7,75.4,57.9,17.5,7.8,18.722222
8,20160626,69.6,77.7,60.3,17.4,9.3,20.888889
9,20160627,60.7,70.0,-9999.0,-9999.0,-9999.0,15.944444


### Data type conversions

In [15]:
# Original values:
# %nl2pandas show first rows
# NOTE(review): this previews warm_temps, whereas the conversion cells that follow
# operate on data, so the values shown here are not the ones being truncated or
# rounded -- confirm which frame is intended.
warm_temps["TEMP"].head()

0    63.4
1    60.4
2    59.3
3    62.6
4    61.7
Name: TEMP, dtype: float64

In [16]:
# Truncated integer values:
# %nl2pandas convert 'TEMP' to type 'int'
data["TEMP"].astype(int).head()

0    65
1    65
2    68
3    57
4    51
Name: TEMP, dtype: int32

In [23]:
# %nl2pandas round 'TEMP' to 0 decimals
# edited to chain the rounding with the integer conversion
data["TEMP"].round(decimals=0).astype(int).head()

0    66
1    66
2    68
3    58
4    51
Name: TEMP, dtype: int32

### Unique values

In [18]:
# %nl2pandas get unique values from 'TEMP'
# Distinct TEMP values, in order of first appearance
unique = pd.unique(data["TEMP"])
unique

array([65.5, 65.8, 68.4, 57.5, 51.4, 52.2, 56.9, 54.2, 49.4, 49.5, 54. ,
       55.4, 58.3, 59.7, 63.4, 57.8, 60.4, 57.3, 56.3, 59.3, 62.6, 61.7,
       60.9, 61.1, 65.7, 69.6, 60.7, 65.4])

### Sorting data

In [19]:
# %nl2pandas sort the data by 'TEMP'
# Ascending order (the default); the sorted frame is only displayed, data is not modified.
data.sort_values('TEMP', ascending=True)

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS
8,20160609,49.4,54.1,45.7,8.4,3.7,9.666667
9,20160610,49.5,55.9,43.0,12.9,6.5,9.722222
4,20160605,51.4,58.3,43.2,15.1,8.2,10.777778
5,20160606,52.2,59.7,42.8,16.9,9.4,11.222222
10,20160611,54.0,62.1,41.7,20.4,12.3,12.222222
7,20160608,54.2,,47.5,,6.7,12.333333
11,20160612,55.4,64.2,46.0,18.2,9.4,13.0
18,20160619,56.3,59.2,54.1,5.1,2.2,13.5
6,20160607,56.9,65.1,45.9,19.2,11.0,13.833333
17,20160618,57.3,,54.0,,3.3,14.055556


In [20]:
# %nl2pandas sort data by 'TEMP'
# Descending order this time: highest TEMP first.
data.sort_values('TEMP', ascending=False)

Unnamed: 0,YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_MIN,TEMP_CELSIUS
25,20160626,69.6,77.7,60.3,17.4,9.3,20.888889
2,20160603,68.4,,55.6,,12.8,20.222222
1,20160602,65.8,80.8,55.0,25.8,10.8,18.777778
28,20160629,65.8,73.2,,,,18.777778
29,20160630,65.7,72.7,59.2,13.5,6.5,18.722222
24,20160625,65.7,75.4,57.9,17.5,7.8,18.722222
0,20160601,65.5,73.6,54.7,18.9,10.8,18.611111
27,20160628,65.4,73.0,55.8,17.2,9.6,18.555556
14,20160615,63.4,70.3,49.3,21.0,14.1,17.444444
20,20160621,62.6,71.4,50.4,21.0,12.2,17.0


### Writing data to a file

In [21]:
# %nl2pandas save data as "Kumpula_temps_June_2016.csv"
# Default to_csv settings, so the row index is written as the first column.
data.to_csv('Kumpula_temps_June_2016.csv')

In [22]:
# %nl2pandas save data as "Kumpula_temps_June_2016.csv"
# edited: writes warm_temps to its own file, with floats formatted to one decimal and no index column
warm_temps.to_csv('Kumpula_temps_above15_June_2016.csv', index=False, float_format='%.1f')