<h2>Basic Analytic Functions Operations</h2>
<p>
This Python Notebook provides some basic workflow operations for the Teradata analytic functions library. It will use the plots, species, and surveys datasets created
when running the PythonBasicAnalytics-Setup SQL notebook.</p>

<i>NOTE: You must have a connection to Teradata Vantage that has the Teradata analytic functions installed.</i>
    


<h3>Get the list of Teradata analytic functions</h3>

In [None]:
help("teradataml")

<h3>Import statements</h3>

In [None]:
import getpass
import os

import numpy as np
from teradataml.context.context import create_context, remove_context, get_context
from teradataml.dataframe.dataframe import DataFrame
from teradataml.dataframe.copy_to import copy_to_sql
from teradataml.options.display import display

<h3>Create a connection</h3>

In [None]:
# Connection details are read from the environment variables TD_HOST, TD_USER and
# TD_PASSWORD so that real credentials are never typed into (and saved with) the notebook.
# If TD_HOST / TD_USER are not set, replace the "xxxxx" placeholders with your cluster details.
host = os.environ.get("TD_HOST", "xxxxx")
user = os.environ.get("TD_USER", "xxxxx")
# Fall back to a no-echo prompt when no password has been exported.
passwd = os.environ.get("TD_PASSWORD") or getpass.getpass("Password for {}@{}: ".format(user, host))
eng = create_context(host = host, username = user, password = passwd)
conn = eng.connect()
print(eng)
print(conn)

<h3>Create Teradata DataFrames</h3>

<i>Insert the tables' data using the Teradata SQL notebooks provided (PythonBasicAnalytics-Cleanup.ipynb and PythonBasicAnalytics-Setup.ipynb) before executing the next cell.</i>

In [None]:
plots_data = DataFrame('plots')
species_data = DataFrame('species')
surveys_data = DataFrame('surveys')

In [None]:
plots_data.head(5)

In [None]:
species_data.head(5)

In [None]:
surveys_data.head(5)

In [None]:
def assertLength(df, value):
    """Verify that a DataFrame has exactly ``value`` rows.

    Parameters
    ----------
    df : object exposing a ``shape`` tuple (e.g. a teradataml DataFrame)
        The frame whose row count, ``df.shape[0]``, is checked.
    value : int
        The expected number of rows.

    Raises
    ------
    AssertionError
        If the row count differs from ``value``. The message reports both the
        expected and the actual count.
    """
    actual = df.shape[0]
    # Raise explicitly instead of using a bare ``assert`` so the check still runs
    # under ``python -O`` and a failure says what was found.
    if actual != value:
        raise AssertionError("Expected {} rows but found {}".format(value, actual))

## Verifying the correctness of initial data setup

In [None]:
assertLength(plots_data, 24)
assertLength(species_data, 54)
assertLength(surveys_data, 1236)

In [None]:
sorted_surveys_data = surveys_data.sort('record_id')
sorted_surveys_data.head(10)

In [None]:
# Spot-check the first rows of the sorted survey data against known values
pandas_data = sorted_surveys_data.to_pandas()

assert pandas_data.index.name == "record_id"
assert list(pandas_data.index)[0:10] == list(range(1, 11))

# Expected leading values per column; the length of each list sets how many
# rows of that column are compared.
expected_leading_values = {
    'plot_id': [2, 3, 2, 7, 3, 1, 2, 1, 1, 6],
    'year': [1977]*10,
    'month': [7]*2,
    'day': [16]*10,
    'species_id': ['NL  ','NL  ','DM  ','DM  ','DM  ','PF  ','PE  ','DM  ','DM  ','PF  '],
    'hindfoot_length': [32.0, 33.0, 37.0, 36.0, 35.0, 14.0],
}
for column_name, leading_values in expected_leading_values.items():
    assert list(pandas_data[column_name][0:len(leading_values)]) == leading_values

## Subsetting, sorting, transformation, joining of Teradata DataFrames

In [None]:
# extract the survey observations for the first three months of 1990 with filter
surveys1990_winter = surveys_data[(surveys_data.year == 1990) & ((surveys_data.month == 1) | 
                                                                 (surveys_data.month == 2) | 
                                                                 (surveys_data.month == 3))]

surveys1990_winter

In [None]:
assertLength(surveys1990_winter, 16)

In [None]:
# Remove redundant 'year' column - as all the rows have same value '1990'
surveys1990_winter = surveys1990_winter.drop(['year'], axis = 1)
surveys1990_winter.head()

In [None]:
assert('year' not in surveys1990_winter.columns) # Verifying whether 'year' is in list of columns

In [None]:
# Sort the 1990 winter surveys data by species_id in descending order,
# then by weight in ascending order.
sorted_survey1990 = surveys1990_winter.sort(columns=['species_id', 'weight'], ascending=[False, True])
sorted_survey1990

In [None]:
# Count the number of individuals by species observed in the winter of 1990.
# First group the surveys1990_winter teradataml DataFrame by species_id with groupby(),
# then call count() and keep only the species_id and count_record_id columns.

count_1990w = surveys1990_winter.groupby('species_id').count().select(['species_id', 'count_record_id'])
# Expose the count under the shorter name 'count1', then drop the original column.
count_1990w = count_1990w.assign(count1 = count_1990w.count_record_id)
count_1990w = count_1990w.drop(['count_record_id'], axis=1)
count_1990w

In [None]:
assertLength(count_1990w, 8)

In [None]:
# Sum of all count1 column values will give total samples (tot_cnt)
total_count = count_1990w.select('count1').sum()
tot_cnt = total_count.to_pandas()['sum_count1'][0]
tot_cnt

In [None]:
surveys1990_winter.shape

In [None]:
# we use the previously obtained count variable to derive the PROPORTION of individuals represented by
# each species, and assign the result to a new 'prop' column.
count_1990w1 = count_1990w.assign(prop = count_1990w.count1 / float(tot_cnt))
count_1990w1

In [None]:
count_1990w1.shape

In [None]:
count_1990w1.dtypes # Type of data in every column

In [None]:
# The proportions are floating-point values, so their total is only approximately 1
# (e.g. 0.9999999999999998). Compare with a tolerance: round() would accept any
# total between roughly 0.5 and 1.5.
prop_total = float(count_1990w1.select('prop').sum().to_pandas()['sum_prop'][0])
assert np.isclose(prop_total, 1.0), "proportions sum to {}, expected 1".format(prop_total)

In [None]:
# Inner join of DataFrames to get all species data along with count1 and prop.
counts_1990w_join_inner = count_1990w1.join(species_data, on = 'species_id', how = "inner", lsuffix = 't1', rsuffix = 't2')
counts_1990w_join_inner = counts_1990w_join_inner.assign(species_id = counts_1990w_join_inner.t1_species_id)
counts_1990w_join_inner = counts_1990w_join_inner.drop(['t1_species_id', 't2_species_id'], axis = 1)
counts_1990w_join_inner

In [None]:
counts_1990w_join_inner.shape

In [None]:
counts_1990w_join_inner.to_pandas()

In [None]:
# The inner join result must have as many columns as the de-duplicated union of
# both inputs' columns, and every one of its columns must come from that union.
expected_columns = list(set(species_data.columns + count_1990w1.columns))
joined_columns = counts_1990w_join_inner.columns
assert len(joined_columns) == len(expected_columns)
assert not [col for col in joined_columns if col not in expected_columns]

In [None]:
display.max_rows = 20

In [None]:
print(count_1990w1.shape)
print(count_1990w1.columns)

In [None]:
print(species_data.shape)
print(species_data.columns)

In [None]:
list(set(species_data.columns + count_1990w1.columns))

In [None]:
# Left join of DataFrames to get all species data along with count1 and prop.
counts_1990w_join_left = count_1990w1.join(species_data, on = 'species_id', how = "left", lsuffix = 't1', rsuffix = 't2')
counts_1990w_join_left = counts_1990w_join_left.assign(species_id = counts_1990w_join_left.t1_species_id)
counts_1990w_join_left = counts_1990w_join_left.drop(['t1_species_id', 't2_species_id'], axis = 1)
print(counts_1990w_join_left.shape)
counts_1990w_join_left # Observe one species with " " species_id - which is not present in counts_1990w_join_inner

In [None]:
counts_1990w_join_left.sum().to_pandas() # sum_count1 should be 16 (total samples) and sum_prop should be 1 (adding up all proportions)

In [None]:
# Run the aggregation once and reuse the result for both checks.
join_left_totals = counts_1990w_join_left.sum().to_pandas()
assert join_left_totals['sum_count1'][0] == 16
# sum_prop is a floating-point total (it can come back as 0.9999999999999998),
# so compare with a tolerance instead of exact equality.
assert np.isclose(float(join_left_totals['sum_prop'][0]), 1.0)

In [None]:
list(counts_1990w_join_left.select('genus').to_pandas()['genus'])

In [None]:
# Get max value of every column grouped by genus
counts_1990w_join_left.groupby('genus').max()

In [None]:
assertLength(counts_1990w_join_left.groupby('genus').max(), 7)

In [None]:
# Getting count of samples under each taxa
taxa_grouby_sum = counts_1990w_join_left.groupby('taxa').sum().select(['taxa', 'sum_count1'])
pd_taxa_groupby_sum = taxa_grouby_sum.to_pandas()
pd_taxa_groupby_sum

### Calculate the fraction of total counts by taxa (birds or rodents) represented by each species within that taxon

In [None]:
taxa_groupby_join_counts = counts_1990w_join_left.join(taxa_grouby_sum, how = 'right', on = 'taxa', 
                                       lsuffix= 'cnt', rsuffix= 'x')
taxa_groupby_join_counts

In [None]:
taxa_groupby_join_counts = taxa_groupby_join_counts.drop(['cnt_taxa'], axis=1) # Dropping repeated column
taxa_groupby_join_counts 

In [None]:
# count1 and sum_count1 are both integer counts, so dividing one by the other directly
# produces 0 for every row (each ratio lies between 0 and 1). Multiplying by 1.0 first
# brings a float operand into the expression, as the species-level 'prop' calculation
# does with float(tot_cnt), so the fraction is kept.
# NOTE(review): confirm the resulting 'prop' column type on your Vantage system.
taxa_groupby_prop_counts = taxa_groupby_join_counts.assign(
                                    prop = taxa_groupby_join_counts.count1 * 1.0 / taxa_groupby_join_counts.sum_count1)
taxa_groupby_prop_counts

In [None]:
# The features of all surveyed individuals of Reithrodontomys montanus (RO).

surveys_RO = surveys_data[surveys_data.species_id == 'RO']
surveys_RO

In [None]:
# Find the average weight and hindfoot length of Dipodomys merriami (DM) individuals 
# observed in each month (irrespective of the year). Make sure to exclude NA values.

In [None]:
surveys_dm = surveys_data[surveys_data.species_id == 'DM']
surveys_dm

In [None]:
surveys_dm.to_pandas().shape

In [None]:
survey_monthwise = surveys_dm.groupby('month').agg({'weight' : 'mean', 'hindfoot_length' : 'mean'})

In [None]:
assertLength(survey_monthwise, 12)

In [None]:
expected_columns = ['month', 'mean_weight', 'mean_hindfoot_length']
assert(len(survey_monthwise.columns) == len(expected_columns))
assert(all(col in expected_columns for col in survey_monthwise.columns))

In [None]:
remove_context()

<span style="font-size:16px;">For more information on the Teradata analytic functions, refer to the [Teradata Documentation](https://docs.teradata.com/) and search for Teradata Package for Python.</span>

Copyright 2019-2022 Teradata. All rights reserved.