In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import shutil
import os

In [2]:
shutil.unpack_archive('archive.zip')

In [3]:
# Loads the extracted honeypot CSV file and returns a cleaned dataframe
# that is ready for analysis.

def clean_archive_data(csv_file):
    """Load the honeypot CSV, fill or drop missing values, then delete the file.

    Parameters
    ----------
    csv_file : str or path-like
        Path of the CSV file to read. The file is removed from disk once the
        cleaning has succeeded.

    Returns
    -------
    pandas.DataFrame
        A new dataframe in which
        * missing values in the descriptive columns ('type', 'country', 'cc',
          'locale', 'localeabbr', 'postalcode') are replaced by 'unknown',
        * missing values in the port columns ('spt', 'dpt') are replaced by 0,
        * the 'Unnamed: 15' column is removed when it is present,
        * every row that still contains a null value is dropped.
        The original row labels are kept (the index is not reset).
    """
    honey_pot_data = pd.read_csv(csv_file)

    text_columns = ['type', 'country', 'cc', 'locale', 'localeabbr', 'postalcode']
    port_columns = ['spt', 'dpt']

    # One fill value per column, applied in a single non-inplace call.
    # Filling through `frame[col].fillna(..., inplace=True)` acts on a temporary
    # object and leaves the frame untouched when pandas Copy-on-Write is active.
    fill_values = {col: 'unknown' for col in text_columns}
    fill_values.update({col: 0 for col in port_columns})

    cleaned_data = (
        honey_pot_data
        .fillna(value=fill_values)
        # errors='ignore' so a file without the stray column is still accepted
        .drop(columns='Unnamed: 15', errors='ignore')
        # any row that still has a null value is unusable for the analysis
        .dropna(axis=0)
    )

    # Remove the source file only after the data has been read and cleaned
    os.remove(csv_file)
    return cleaned_data

In [4]:
clean_data_frame = clean_archive_data('AWS_Honeypot_marx-geo.csv')

In [5]:
# Fixed seed so the 40,000-row sample (and the CSV written from it) is the
# same on every run of the notebook.
sample_data = clean_data_frame.sample(n=40000, random_state=42)

In [6]:
sample_data.shape

(40000, 15)

In [7]:
sample_data.to_csv("sample_data.csv")

In [5]:
!ls

archive.zip		      pandas_intro_data_cleaning.ipynb
Cleaned_data_forAnalysis.csv  pandas_intro_to_analysis.ipynb
cleaning_honeypot_script.py


In [51]:
clean_data_frame.head(2)

Unnamed: 0,datetime,host,src,proto,type,spt,dpt,srcstr,cc,country,locale,localeabbr,postalcode,latitude,longitude
0,3/3/13 21:53,groucho-oregon,1032051418,TCP,unknown,6000.0,1433.0,61.131.218.218,CN,China,Jiangxi Sheng,36,unknown,28.55,115.9333
1,3/3/13 21:57,groucho-oregon,1347834426,UDP,unknown,5270.0,5060.0,80.86.82.58,DE,Germany,unknown,unknown,unknown,51.0,9.0


In [6]:
clean_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 448112 entries, 0 to 451580
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   datetime    448112 non-null  object 
 1   host        448112 non-null  object 
 2   src         448112 non-null  int64  
 3   proto       448112 non-null  object 
 4   type        448112 non-null  object 
 5   spt         448112 non-null  float64
 6   dpt         448112 non-null  float64
 7   srcstr      448112 non-null  object 
 8   cc          448112 non-null  object 
 9   country     448112 non-null  object 
 10  locale      448112 non-null  object 
 11  localeabbr  448112 non-null  object 
 12  postalcode  448112 non-null  object 
 13  latitude    448112 non-null  float64
 14  longitude   448112 non-null  float64
dtypes: float64(4), int64(1), object(10)
memory usage: 54.7+ MB


In [52]:
clean_data_frame.datetime[0]

'3/3/13 21:53'

In [8]:
date_str = clean_data_frame.datetime[0]

In [11]:
date_str.split('/')[0]

'3'

In [12]:
date_str.split('/')[2]

'13 21:53'

In [14]:
date_str.split('/')[2].split(' ')[0]

'13'

In [9]:
#first course of action, convert the str datetime to Datetime
import datetime

date_obj = datetime.datetime.strptime(clean_data_frame.datetime[0],'%m/%d/%y %H:%M')

In [18]:
date_obj

datetime.datetime(2013, 3, 3, 21, 53)

In [15]:
date_obj.day

3

In [16]:
#This dataframe is useful when doing analysis, if main dataframe
#is corrupted
backup_df_obj = clean_data_frame.copy()

In [None]:
#Don't use this cell unless required
#clean_data_frame = backup_df_obj.copy()

In [27]:
# Convert the whole string column in one vectorised call; an explicit format
# avoids per-row strptime calls through apply and keeps the parsing unambiguous
# (month/day/two-digit-year hour:minute).
clean_data_frame['datetime_obj'] = pd.to_datetime(
    clean_data_frame['datetime'], format='%m/%d/%y %H:%M')

In [20]:
def conv_str_datetime(dt_str):
    """Parse a 'month/day/2-digit-year hour:minute' string into a datetime.

    Example: '3/3/13 21:53' -> datetime.datetime(2013, 3, 3, 21, 53).
    Raises ValueError when the string does not match that layout.
    """
    honeypot_format = '%m/%d/%y %H:%M'
    parsed = datetime.datetime.strptime(dt_str, honeypot_format)
    return parsed

In [21]:
import time

In [28]:
clean_data_frame.head(2)

Unnamed: 0,datetime,host,src,proto,type,spt,dpt,srcstr,cc,country,locale,localeabbr,postalcode,latitude,longitude,datetime_obj
0,3/3/13 21:53,groucho-oregon,1032051418,TCP,unknown,6000.0,1433.0,61.131.218.218,CN,China,Jiangxi Sheng,36,unknown,28.55,115.9333,2013-03-03 21:53:00
1,3/3/13 21:57,groucho-oregon,1347834426,UDP,unknown,5270.0,5060.0,80.86.82.58,DE,Germany,unknown,unknown,unknown,51.0,9.0,2013-03-03 21:57:00


In [23]:
# perf_counter is the monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump.
start = time.perf_counter()
clean_data_frame.datetime.apply(conv_str_datetime)
end = time.perf_counter()

In [24]:
end - start

14.81672215461731

In [25]:
clean_data_frame[:5].datetime.apply(conv_str_datetime)

0   2013-03-03 21:53:00
1   2013-03-03 21:57:00
2   2013-03-03 21:58:00
3   2013-03-03 21:58:00
4   2013-03-03 21:58:00
Name: datetime, dtype: datetime64[ns]

In [29]:
clean_data_frame.datetime_obj[:5]

0   2013-03-03 21:53:00
1   2013-03-03 21:57:00
2   2013-03-03 21:58:00
3   2013-03-03 21:58:00
4   2013-03-03 21:58:00
Name: datetime_obj, dtype: datetime64[ns]

In [30]:
date_data = clean_data_frame.datetime_obj[0]

In [35]:
date_data.day_name()

'Sunday'

In [36]:
date_data.month_name()

'March'

In [37]:
date_data.weekofyear

9

In [38]:
date_data.hour

21

In [39]:
# Add calendar features (weekday name, month name, week of year, hour of day)
# using the vectorised .dt accessor instead of a Python call per row.
clean_data_frame['day_week'] = clean_data_frame['datetime_obj'].dt.day_name()

clean_data_frame['month_name'] = clean_data_frame['datetime_obj'].dt.month_name()

# isocalendar().week is the ISO week number, the same value that
# Timestamp.weekofyear reports; cast so the column stays a plain int64
clean_data_frame['week_year'] = (
    clean_data_frame['datetime_obj'].dt.isocalendar().week.astype('int64'))

# .dt.hour may come back as int32 depending on the pandas version; keep int64
clean_data_frame['incident_hour'] = (
    clean_data_frame['datetime_obj'].dt.hour.astype('int64'))

In [40]:
# Drop the string datetime column; datetime_obj replaces it.
# Non-inplace with errors='ignore' so re-running this cell does not raise
# a KeyError once the column is already gone.
clean_data_frame = clean_data_frame.drop(columns='datetime', errors='ignore')

In [41]:
clean_data_frame.head(5)

Unnamed: 0,host,src,proto,type,spt,dpt,srcstr,cc,country,locale,localeabbr,postalcode,latitude,longitude,datetime_obj,day_week,month_name,week_year,incident_hour
0,groucho-oregon,1032051418,TCP,unknown,6000.0,1433.0,61.131.218.218,CN,China,Jiangxi Sheng,36,unknown,28.55,115.9333,2013-03-03 21:53:00,Sunday,March,9,21
1,groucho-oregon,1347834426,UDP,unknown,5270.0,5060.0,80.86.82.58,DE,Germany,unknown,unknown,unknown,51.0,9.0,2013-03-03 21:57:00,Sunday,March,9,21
2,groucho-oregon,2947856490,TCP,unknown,2489.0,1080.0,175.180.184.106,TW,Taiwan,Taipei,unknown,unknown,25.0392,121.525,2013-03-03 21:58:00,Sunday,March,9,21
3,groucho-us-east,841842716,UDP,unknown,43235.0,1900.0,50.45.128.28,US,United States,Oregon,OR,97124,45.5848,-122.9117,2013-03-03 21:58:00,Sunday,March,9,21
4,groucho-singapore,3587648279,TCP,unknown,56577.0,80.0,213.215.43.23,FR,France,unknown,unknown,unknown,48.86,2.35,2013-03-03 21:58:00,Sunday,March,9,21


After all the cleaning and restructuring work, the analysis can start.
To analyse the data well, you have to be curious about it. In this case the data records a series of attacks on a set of honeypots hosted in a cloud service called AWS.

Which pot was targeted the most?

Where did the attacks come from (which source IP)?

Which country did the attackers originate from?

Which port was targeted on each of the pots?

In which month, week, day, and hour of the day did the attacks occur most often?

In [47]:
host_target = clean_data_frame.groupby('host')['srcstr'].agg('count')

In [45]:
host_target.head()

host
groucho-eu           23411
groucho-norcal       24005
groucho-oregon       93935
groucho-sa           24209
groucho-singapore    77663
Name: srcstr, dtype: int64

In [46]:
type(host_target)

pandas.core.series.Series

In [50]:
host_target = clean_data_frame.groupby('host').agg('count')
host_target.reset_index(inplace=True)
host_target

Unnamed: 0,host,src,proto,type,spt,dpt,srcstr,cc,country,locale,localeabbr,postalcode,latitude,longitude,datetime_obj,day_week,month_name,week_year,incident_hour
0,groucho-eu,23411,23411,23411,23411,23411,23411,23411,23411,23411,23411,23411,23411,23411,23411,23411,23411,23411,23411
1,groucho-norcal,24005,24005,24005,24005,24005,24005,24005,24005,24005,24005,24005,24005,24005,24005,24005,24005,24005,24005
2,groucho-oregon,93935,93935,93935,93935,93935,93935,93935,93935,93935,93935,93935,93935,93935,93935,93935,93935,93935,93935
3,groucho-sa,24209,24209,24209,24209,24209,24209,24209,24209,24209,24209,24209,24209,24209,24209,24209,24209,24209,24209
4,groucho-singapore,77663,77663,77663,77663,77663,77663,77663,77663,77663,77663,77663,77663,77663,77663,77663,77663,77663,77663
5,groucho-sydney,24372,24372,24372,24372,24372,24372,24372,24372,24372,24372,24372,24372,24372,24372,24372,24372,24372,24372
6,groucho-tokyo,125638,125638,125638,125638,125638,125638,125638,125638,125638,125638,125638,125638,125638,125638,125638,125638,125638,125638
7,groucho-us-east,30937,30937,30937,30937,30937,30937,30937,30937,30937,30937,30937,30937,30937,30937,30937,30937,30937,30937
8,zeppo-norcal,23942,23942,23942,23942,23942,23942,23942,23942,23942,23942,23942,23942,23942,23942,23942,23942,23942,23942


In [51]:
host_target[['host','src']]

Unnamed: 0,host,src
0,groucho-eu,23411
1,groucho-norcal,24005
2,groucho-oregon,93935
3,groucho-sa,24209
4,groucho-singapore,77663
5,groucho-sydney,24372
6,groucho-tokyo,125638
7,groucho-us-east,30937
8,zeppo-norcal,23942


In [52]:
# Every question listed above comes down to a groupby followed by a count.

host_target = (
    clean_data_frame
    .groupby('host')
    .agg('count')
    .reset_index()
    .loc[:, ['host', 'src']]
)
host_target.head(10)

Unnamed: 0,host,src
0,groucho-eu,23411
1,groucho-norcal,24005
2,groucho-oregon,93935
3,groucho-sa,24209
4,groucho-singapore,77663
5,groucho-sydney,24372
6,groucho-tokyo,125638
7,groucho-us-east,30937
8,zeppo-norcal,23942


In [53]:
#create a function that returns the aggregated data
def return_agg_column(datafrm, column_name, count_column='src'):
    """Count rows per distinct value of one column, largest count first.

    Parameters
    ----------
    datafrm : pandas.DataFrame
        Frame to aggregate.
    column_name : str
        Column whose distinct values form the groups.
    count_column : str, default 'src'
        Column whose non-null values are counted inside each group. It must be
        a different column from `column_name`.

    Returns
    -------
    pandas.DataFrame
        Two columns, `column_name` and `count_column` (holding the counts),
        sorted by count in descending order. The index holds the position each
        group had before sorting.
    """
    # Count only the column that is reported, rather than counting every
    # column of the frame and discarding all but one afterwards.
    group_counts = (
        datafrm
        .groupby(column_name)[count_column]
        .count()
        .reset_index()
    )
    return group_counts.sort_values(by=count_column, ascending=False)

In [54]:
#create a function that returns the aggregated data
def agg_two_column(datafrm, column_one, column_two, count_column='src'):
    """Count rows per distinct pair of values of two columns, largest first.

    Parameters
    ----------
    datafrm : pandas.DataFrame
        Frame to aggregate.
    column_one, column_two : str
        Columns whose value pairs form the groups.
    count_column : str, default 'src'
        Column whose non-null values are counted inside each group. It must be
        different from both grouping columns.

    Returns
    -------
    pandas.DataFrame
        Three columns, `column_one`, `column_two` and `count_column` (holding
        the counts), sorted by count in descending order. The index holds the
        position each group had before sorting.
    """
    # Count only the column that is reported, rather than counting every
    # column of the frame and discarding all but one afterwards.
    pair_counts = (
        datafrm
        .groupby([column_one, column_two])[count_column]
        .count()
        .reset_index()
    )
    return pair_counts.sort_values(by=count_column, ascending=False)

In [55]:
host_tgted_data = return_agg_column(datafrm=clean_data_frame,
                                   column_name='host')
host_tgted_data.head(5)

Unnamed: 0,host,src
6,groucho-tokyo,125638
2,groucho-oregon,93935
4,groucho-singapore,77663
7,groucho-us-east,30937
5,groucho-sydney,24372


Top 5 targeted pots are shown above. How many pots are there?

In [94]:
host_tgted_data.shape[0]

9

In [95]:
host_tgted_data

Unnamed: 0,host,src
6,groucho-tokyo,125638
2,groucho-oregon,93935
4,groucho-singapore,77663
7,groucho-us-east,30937
5,groucho-sydney,24372
3,groucho-sa,24209
1,groucho-norcal,24005
8,zeppo-norcal,23942
0,groucho-eu,23411


In [56]:
port_tgted_data = return_agg_column(datafrm=clean_data_frame,
                                   column_name='dpt')
port_tgted_data.head(5)

Unnamed: 0,dpt,src
542,1433.0,109397
0,0.0,44444
76,445.0,40077
805,3389.0,29990
26,80.0,19511


In [57]:
#How many ports are targeted. That is a lot of ports... 
port_tgted_data.shape[0]

4034

In [58]:
host_port_tgted_data = agg_two_column(datafrm=clean_data_frame,
                                      column_one='host',
                                   column_two='dpt')
host_port_tgted_data.head(5)

Unnamed: 0,host,dpt,src
2564,groucho-oregon,1433.0,58997
6248,groucho-tokyo,445.0,31085
4424,groucho-singapore,1433.0,20388
6688,groucho-tokyo,56338.0,18195
6210,groucho-tokyo,0.0,16089


In [87]:
#A quick search shows that 1433 is the Microsoft SQL Server database port and
#445 is the Microsoft-DS port (Active Directory / SMB file sharing). Both are very important ports.

In [89]:
src_ip_dst_pot = agg_two_column(datafrm=clean_data_frame,
                               column_one='srcstr',column_two='host')
src_ip_dst_pot.head(5)

Unnamed: 0,srcstr,host,src
37101,175.146.199.252,groucho-tokyo,18472
55275,2.186.189.218,groucho-tokyo,11116
59002,203.178.148.19,groucho-tokyo,4384
30848,128.9.168.98,groucho-tokyo,4330
30935,129.82.138.44,groucho-tokyo,4130


In [91]:
country_dst_pot = agg_two_column(datafrm=clean_data_frame,
                                column_one='country',
                                 column_two='host')
country_dst_pot.head(5)

Unnamed: 0,country,host,src
214,China,groucho-oregon,56533
218,China,groucho-tokyo,46518
216,China,groucho-singapore,38528
1080,United States,groucho-oregon,22040
1084,United States,groucho-tokyo,18891


In [99]:
day_hour_attack = agg_two_column(datafrm=clean_data_frame,
               column_one='day_week',column_two='incident_hour')
day_hour_attack.head(5)

Unnamed: 0,day_week,incident_hour,src
151,Wednesday,7,13889
120,Tuesday,0,12050
47,Monday,23,10203
67,Saturday,19,4865
42,Monday,18,3241


In [103]:
#lets learn about filtering the data...
day_hour_attack.filter(items=['day_week','incident_hour'])

Unnamed: 0,day_week,incident_hour
151,Wednesday,7
120,Tuesday,0
47,Monday,23
67,Saturday,19
42,Monday,18
...,...,...
21,Friday,21
93,Sunday,21
119,Thursday,23
22,Friday,22


# Alternative to the filter method

In [110]:
host_tgted_data[host_tgted_data.src > 50000]

Unnamed: 0,host,src
6,groucho-tokyo,125638
2,groucho-oregon,93935
4,groucho-singapore,77663


In [112]:
host_port_tgted_data[host_port_tgted_data.host == 'groucho-oregon']

Unnamed: 0,host,dpt,src
2564,groucho-oregon,1433.0,58997
2606,groucho-oregon,3306.0,5752
2611,groucho-oregon,3389.0,4821
2491,groucho-oregon,0.0,2135
2497,groucho-oregon,22.0,1952
...,...,...,...
2681,groucho-oregon,5091.0,1
2683,groucho-oregon,5093.0,1
2684,groucho-oregon,5094.0,1
2685,groucho-oregon,5095.0,1


In [114]:
# we want to infer which port is which application
# read_html downloads the page and returns a list with one DataFrame per HTML
# table it finds, so this cell needs network access.
# NOTE(review): the cells below pick tables by position in that list; the
# positions presumably shift whenever the page layout changes - confirm
# before relying on them.
port_data = pd.read_html("https://en.wikipedia.org/wiki/List_of_TCP_and_UDP_port_numbers")

In [115]:
len(port_data)

7

In [118]:
port_data[3]

Unnamed: 0,Cell,Description
0,Yes,Described protocol is assigned by IANA for thi...
1,Unofficial,Described protocol is not assigned by IANA for...
2,Assigned,Described protocol is assigned by IANA for thi...
3,No,Described protocol is not: assigned by IANA fo...
4,Reserved,"Port is reserved by IANA,[2] generally to prev..."


In [129]:
port_low = port_data[4][['Port','Description']]
port_high = port_data[5][['Port','Description']]
port_all = pd.concat([port_low,port_high])
port_all.shape

(1401, 2)

In [140]:
'-' in '10000-10500'

True

In [157]:
def split_ports(port_str):
    """Turn a port string into an int, or into [first, last] for a range.

    Parameters
    ----------
    port_str : str
        A single port such as '80', or a range such as '10000-10500'. The
        range separator may be an ASCII hyphen or an en dash, and surrounding
        whitespace is ignored.

    Returns
    -------
    int or list of int
        The port number, or the two-element list [first, last] for a range.
        When more than two parts are present only the first two are used.

    Raises
    ------
    ValueError
        If any part that is used is not a valid integer.
    """
    # str.strip() returns a new string, so the result has to be kept
    cleaned = port_str.strip()
    # Treat the en dash like the hyphen so both range spellings are split
    parts = cleaned.replace('–', '-').split('-')
    if len(parts) > 1:
        return [int(parts[0]), int(parts[1])]
    return int(cleaned)

In [158]:
split_ports('8-100000')

[8, 100000]

In [193]:
single_ports = port_all[~port_all.Port.str.contains('–')]

In [194]:
single_ports = single_ports[~single_ports.Port.str.contains('-')]

In [187]:
single_ports

Unnamed: 0,Port,Description
0,0,In programming APIs (not in communication betw...
1,1,TCP Port Service Multiplexer (TCPMUX). Histori...
856,10000,"Webmin, Web-based Unix/Linux system administra..."
855,10000,BackupExec
854,10000,Network Data Management Protocol (NDMP) Contro...
...,...,...
849,9987,TeamSpeak 3 server default (voice) port (for t...
850,9993,ZeroTier Default port for ZeroTier
851,9997,Splunk port for communication between the forw...
852,9999,Urchin Web Analytics[citation needed]


In [195]:
def try_typecast(integ):
    """Convert a port value to int, or return None when it is not a number.

    Parameters
    ----------
    integ : str, int, float or None
        The raw port value. Strings are stripped of surrounding whitespace
        before conversion; numeric values are passed straight to int().

    Returns
    -------
    int or None
        The port as an integer, or None when the value cannot be converted
        (text with footnotes or several ports, None, NaN, infinity).
    """
    try:
        # Only strings have whitespace to trim; numbers are converted as-is so
        # that applying this function to an already converted column still works
        candidate = integ.strip() if isinstance(integ, str) else integ
        return int(candidate)
    except (TypeError, ValueError, OverflowError):
        # None rather than 0: port 0 is a real entry in the port table, so a
        # failed conversion must not be mistaken for it
        return None

In [196]:
# Dry run: call try_typecast on every port value; the return values are
# discarded here, so single_ports itself is left unchanged by this loop.
for triKport in single_ports.Port:
    try_typecast(triKport)

In [197]:
# Replace the Port column with the converted value of each entry
single_ports.Port = single_ports.Port.apply(try_typecast)

In [198]:
single_ports.sort_values(by='Port',ascending=False)

Unnamed: 0,Port,Description
1108,49151.0,Reserved[2]
1107,48556.0,drive.web AC/DC Drive Automation and Control N...
1105,44818.0,EtherNet/IP explicit messaging
1104,44405.0,Mu Online Connect Server[citation needed]
1102,43110.0,ZeroNet web UI default port [411]
...,...,...
1,1.0,TCP Port Service Multiplexer (TCPMUX). Histori...
0,0.0,In programming APIs (not in communication betw...
163,,Authenticated SMTP[11] over TLS/SSL (SMTPS) (a...
161,,SMTP over implicit SSL (obsolete)[86]


In [200]:
single_ports[single_ports.Port == 445]

Unnamed: 0,Port,Description
158,445.0,Microsoft-DS (Directory Services) Active Direc...
159,445.0,Microsoft-DS (Directory Services) SMB[11] file...


In [204]:
# First ten destination ports of the (already count-sorted) port table
top_ports = port_tgted_data['dpt'].iloc[:10].tolist()

In [205]:
top_ports

[1433.0, 0.0, 445.0, 3389.0, 80.0, 56338.0, 8080.0, 22.0, 3306.0, 2193.0]

In [207]:
top_port_description = single_ports[single_ports.Port.isin(top_ports)]

In [208]:
top_port_description

Unnamed: 0,Port,Description
0,0.0,In programming APIs (not in communication betw...
43,1433.0,Microsoft SQL Server database management syste...
16,22.0,"Secure Shell (SSH),[11] secure logins, file tr..."
222,3306.0,MySQL database system[11]
231,3389.0,Microsoft Terminal Server (RDP) officially reg...
158,445.0,Microsoft-DS (Directory Services) Active Direc...
159,445.0,Microsoft-DS (Directory Services) SMB[11] file...
58,80.0,Hypertext Transfer Protocol (HTTP)[48][49] use...
685,8080.0,Atlassian JIRA applications[287]
683,8080.0,Alternative port for HTTP. See also ports 80 a...


In [215]:
#lets bring the dataframes together
# Join the per-port attack counts (left) to the port descriptions (right) on
# the port number; the inner join keeps only ports present in both frames.
merged_data = pd.merge(left=port_tgted_data, left_on='dpt',
                       right=top_port_description, right_on='Port',
                       how='inner')
# 'dpt' duplicates 'Port' after the join, so it is dropped; the remaining
# columns are renamed by name instead of by position.
merged_data = (
    merged_data
    .drop(columns='dpt')
    .rename(columns={'src': 'Count', 'Description': 'Application'})
)

In [216]:
merged_data

Unnamed: 0,Count,Port,Application
0,109397,1433.0,Microsoft SQL Server database management syste...
1,44444,0.0,In programming APIs (not in communication betw...
2,40077,445.0,Microsoft-DS (Directory Services) Active Direc...
3,40077,445.0,Microsoft-DS (Directory Services) SMB[11] file...
4,29990,3389.0,Microsoft Terminal Server (RDP) officially reg...
5,19511,80.0,Hypertext Transfer Protocol (HTTP)[48][49] use...
6,15390,8080.0,Atlassian JIRA applications[287]
7,15390,8080.0,Alternative port for HTTP. See also ports 80 a...
8,15390,8080.0,Apache Tomcat[286]
9,15070,22.0,"Secure Shell (SSH),[11] secure logins, file tr..."


In [225]:
new_grp = clean_data_frame.groupby(["dpt","host"]).count()

In [226]:
df_grp = clean_data_frame.groupby(["dpt","host"]).aggregate('count')

In [227]:
type(new_grp)

pandas.core.frame.DataFrame

In [228]:
type(df_grp)

pandas.core.frame.DataFrame

In [234]:
df_grp[["src","proto"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,src,proto
dpt,host,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,groucho-eu,2159,2159
0.0,groucho-norcal,3299,3299
0.0,groucho-oregon,2135,2135
0.0,groucho-sa,2767,2767
0.0,groucho-singapore,10958,10958
...,...,...,...
65500.0,groucho-singapore,225,225
65500.0,groucho-sydney,27,27
65500.0,groucho-tokyo,427,427
65500.0,groucho-us-east,53,53


In [235]:
df_grp[["src","proto"]].stack()

dpt      host                  
0.0      groucho-eu       src      2159
                          proto    2159
         groucho-norcal   src      3299
                          proto    3299
         groucho-oregon   src      2135
                                   ... 
65500.0  groucho-tokyo    proto     427
         groucho-us-east  src        53
                          proto      53
         zeppo-norcal     src        31
                          proto      31
Length: 18804, dtype: int64

In [236]:
df_grp[["src","proto"]].unstack()

Unnamed: 0_level_0,src,src,src,src,src,src,src,src,src,proto,proto,proto,proto,proto,proto,proto,proto,proto
host,groucho-eu,groucho-norcal,groucho-oregon,groucho-sa,groucho-singapore,groucho-sydney,groucho-tokyo,groucho-us-east,zeppo-norcal,groucho-eu,groucho-norcal,groucho-oregon,groucho-sa,groucho-singapore,groucho-sydney,groucho-tokyo,groucho-us-east,zeppo-norcal
dpt,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
0.0,2159.0,3299.0,2135.0,2767.0,10958.0,2784.0,16089.0,2193.0,2060.0,2159.0,3299.0,2135.0,2767.0,10958.0,2784.0,16089.0,2193.0,2060.0
1.0,1.0,1.0,2.0,3.0,3.0,1.0,,3.0,3.0,1.0,1.0,2.0,3.0,3.0,1.0,,3.0,3.0
2.0,1.0,,4.0,1.0,1.0,2.0,1.0,,,1.0,,4.0,1.0,1.0,2.0,1.0,,
3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0
7.0,,,,,,2.0,,,,,,,,,2.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65263.0,,1.0,,,,,,,,,1.0,,,,,,,
65274.0,,,,1.0,,,,,,,,,1.0,,,,,
65293.0,,,,,,,,,1.0,,,,,,,,,1.0
65301.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [237]:
df_grp.reset_index(inplace=True)
df_grp = df_grp[['dpt','src']]
df_grp

Unnamed: 0,dpt,src
0,0.0,2159
1,0.0,3299
2,0.0,2135
3,0.0,2767
4,0.0,10958
...,...,...
9397,65500.0,225
9398,65500.0,27
9399,65500.0,427
9400,65500.0,53


In [238]:
merged_data.columns

Index(['Count', 'Port', 'Application'], dtype='object')

In [240]:
pivot_merge = merged_data.pivot(index='Port',columns='Application',values='Count')
pivot_merge.fillna(0,inplace=True)
pivot_merge

Application,Alternative port for HTTP. See also ports 80 and 8008.,Apache Tomcat[286],Atlassian JIRA applications[287],"Hypertext Transfer Protocol (HTTP)[48][49] uses TCP in versions 1.x and 2. HTTP/3 uses QUIC,[50] a transport protocol on top of UDP.","In programming APIs (not in communication between hosts), requests a system-allocated (dynamic) port[6]",Microsoft SQL Server database management system (MSSQL) server,Microsoft Terminal Server (RDP) officially registered as Windows Based Terminal (WBT)[188],"Microsoft-DS (Directory Services) Active Directory,[85] Windows shares",Microsoft-DS (Directory Services) SMB[11] file sharing,MySQL database system[11],"Secure Shell (SSH),[11] secure logins, file transfers (scp, sftp) and port forwarding"
Port,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,0.0,0.0,0.0,0.0,44444.0,0.0,0.0,0.0,0.0,0.0,0.0
22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15070.0
80.0,0.0,0.0,0.0,19511.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
445.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40077.0,40077.0,0.0,0.0
1433.0,0.0,0.0,0.0,0.0,0.0,109397.0,0.0,0.0,0.0,0.0,0.0
3306.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14511.0,0.0
3389.0,0.0,0.0,0.0,0.0,0.0,0.0,29990.0,0.0,0.0,0.0,0.0
8080.0,15390.0,15390.0,15390.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [250]:
pivot_data = pd.pivot_table(data=clean_data_frame,
              index=['srcstr','dpt','country'],
              columns='host',values='src',
              aggfunc='count')
pivot_data.reset_index(inplace=True)
pivot_data.fillna(value=0,inplace=True)

In [255]:
pivot_data.head(2)

host,srcstr,dpt,country,groucho-eu,groucho-norcal,groucho-oregon,groucho-sa,groucho-singapore,groucho-sydney,groucho-tokyo,groucho-us-east,zeppo-norcal
0,1.0.0.38,445.0,Australia,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
1,1.1.162.110,3544.0,Thailand,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [264]:
pivot_data.loc[pivot_data['groucho-eu'] > 0,['srcstr','groucho-eu']]

host,srcstr,groucho-eu
25,1.160.38.18,1.0
29,1.160.68.234,1.0
30,1.160.68.234,1.0
31,1.160.68.234,1.0
33,1.161.137.125,1.0
...,...,...
85452,98.228.224.216,5.0
85457,98.231.8.166,1.0
85462,98.243.160.139,1.0
85503,99.16.218.201,15.0


In [256]:
time_series_data = pd.date_range(start='20220101',periods=10,freq='m')
time_series_data

DatetimeIndex(['2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31',
               '2022-09-30', '2022-10-31'],
              dtype='datetime64[ns]', freq='M')

In [257]:
day_series_data = pd.date_range(start='20220101',periods=10,freq='d')
day_series_data

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10'],
              dtype='datetime64[ns]', freq='D')

In [258]:
hr_series_data = pd.date_range(start='20220101',periods=10,freq='H')
hr_series_data

DatetimeIndex(['2022-01-01 00:00:00', '2022-01-01 01:00:00',
               '2022-01-01 02:00:00', '2022-01-01 03:00:00',
               '2022-01-01 04:00:00', '2022-01-01 05:00:00',
               '2022-01-01 06:00:00', '2022-01-01 07:00:00',
               '2022-01-01 08:00:00', '2022-01-01 09:00:00'],
              dtype='datetime64[ns]', freq='H')