# Análise de mensagens spam: dias com mais mensagens

In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np

## Dataset contendo apenas os dados necessários para o cálculo

In [2]:
# Load the SMS log and keep only the two columns this analysis reads.
# NOTE(review): the 'mbcs' codec exists only on Windows builds of Python, so
# this cell fails with LookupError on other platforms — confirm the file's
# real encoding (presumably cp1252) and name it explicitly.
data = pd.read_csv('sms_senior.csv', delimiter=',', encoding='mbcs')[['Date', 'IsSpam']]

In [3]:
# Preview the loaded frame: one row per message, with its timestamp and 'yes'/'no' spam label.
data

Unnamed: 0,Date,IsSpam
0,2017-01-01 00:08:00,no
1,2017-01-01 00:19:00,no
2,2017-01-01 01:53:00,yes
3,2017-01-01 02:14:00,no
4,2017-01-01 02:25:00,no
...,...,...
5569,2017-03-31 21:13:00,no
5570,2017-03-31 22:19:00,no
5571,2017-03-31 23:15:00,yes
5572,2017-03-31 23:32:00,no


### Transformação da data para Ano-Mês-Dia e spam para booleano

In [4]:
# Truncate every timestamp to its calendar day (stored as datetime.date objects).
# The whole column is parsed in a single pd.to_datetime call instead of
# invoking the parser once per row through .map.
data['Date'] = pd.to_datetime(data['Date']).dt.date

In [5]:
# Turn the 'yes'/'no' label into a boolean flag. Mind the polarity: True marks
# a message labelled 'no' (i.e. NOT spam), even though the column keeps the
# name IsSpam.
data['IsSpam'] = data['IsSpam'].eq('no')

In [6]:
# Preview the transformed frame: Date is now a calendar day and IsSpam a boolean.
data

Unnamed: 0,Date,IsSpam
0,2017-01-01,True
1,2017-01-01,True
2,2017-01-01,False
3,2017-01-01,True
4,2017-01-01,True
...,...,...
5569,2017-03-31,True
5570,2017-03-31,True
5571,2017-03-31,False
5572,2017-03-31,True


### Sequência máxima de mensagens comuns por dia

In [13]:
def agg_max_sequence_true(arr):
    """Return the length of the longest run of consecutive True values.

    Parameters
    ----------
    arr : array-like of bool
        Sequence of flags (NumPy array, pandas array, list, ...). It is
        coerced to a boolean ndarray before processing.

    Returns
    -------
    int
        Length of the longest uninterrupted stretch of True values, or 0 when
        the input is empty or contains no True at all.
    """
    flags = np.asarray(arr, dtype=bool)
    if flags.size == 0:
        return 0

    # Mark every position where a run starts: index 0 is marked only when the
    # sequence opens with True, each value change is marked, and a sentinel
    # closes the last run. The marked positions therefore alternate
    # "True run starts" / "True run ends", beginning with a True-run start.
    run_edges = np.flatnonzero(
        np.concatenate(([flags[0]], flags[:-1] != flags[1:], [True]))
    )

    # Consecutive differences are run lengths; every second one is a True run.
    true_run_lengths = np.diff(run_edges)[::2]

    # No True run at all (e.g. an all-False day) -> 0 instead of max() failing.
    if true_run_lengths.size == 0:
        return 0
    return int(true_run_lengths.max())

In [14]:
# Longest run of consecutive True flags (messages labelled 'no', i.e. not spam)
# for each calendar day. Named aggregation on a SeriesGroupBy takes the
# callable directly (`name=func`); the `(column, func)` tuple form belongs to
# DataFrameGroupBy and is rejected with TypeError by recent pandas releases.
max_seq = (
    data.groupby(['Date'])['IsSpam']
    .agg(max_seq=lambda x: agg_max_sequence_true(x.array))
    .reset_index()
)

In [15]:
# Preview the per-day result: one row per Date with its max_seq value.
max_seq

Unnamed: 0,Date,max_seq
0,2017-01-01,22
1,2017-01-02,21
2,2017-01-03,18
3,2017-01-04,14
4,2017-01-05,11
...,...,...
85,2017-03-27,12
86,2017-03-28,22
87,2017-03-29,13
88,2017-03-30,13


### Dia com a maior sequência de mensagens comuns por mês

In [10]:
def agg_max_day_in_month(df):
    """Pick, for every month, the row whose ``max_seq`` is the largest.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Date`` column (anything ``pd.to_datetime`` can parse)
        and a ``max_seq`` column. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per month — the first row reaching that month's maximum
        ``max_seq`` — carrying the original columns plus ``YearMonth``, an
        integer of the form ``YYYYMM``.
    """
    parsed_dates = pd.to_datetime(df['Date'])
    # assign() returns a new frame, so the caller's DataFrame does not silently
    # gain a YearMonth column as a side effect of calling this function.
    with_month = df.assign(
        YearMonth=100 * parsed_dates.dt.year + parsed_dates.dt.month
    )
    # idxmax yields the index label of each month's largest max_seq.
    return with_month.loc[with_month.groupby("YearMonth")["max_seq"].idxmax()]

In [11]:
# For each month, keep the row of the day with the largest max_seq.
max_day = agg_max_day_in_month(max_seq)

In [12]:
# Final table: one row per month (YearMonth as the index), showing the day
# with the largest max_seq and that value.
max_day.set_index('YearMonth')[['Date', 'max_seq']]

Unnamed: 0_level_0,Date,max_seq
YearMonth,Unnamed: 1_level_1,Unnamed: 2_level_1
201701,2017-01-26,31
201702,2017-02-04,39
201703,2017-03-31,46
