In [13]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import warnings
# Silence every Python warning for the rest of the session.
warnings.simplefilter(action='ignore')

# Do not truncate long cell values when a DataFrame is displayed.
pd.set_option('display.max_colwidth', None)

In [14]:
# Lift every pandas display limit so frames render in full: all columns,
# all rows, and untruncated cell contents.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Presidio
In this notebook, I apply Presidio to log data from several different sources to showcase its functionality. Nearly all URLs, phone numbers and names are anonymized by the package. To anonymize other textual content, NLP techniques such as masking and vector embeddings can be used.

In [15]:
!pip install presidio-analyzer presidio-anonymizer

[0m

In [16]:
import re
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern

In [17]:
# Presidio engines used by the cells below.
analyzer = AnalyzerEngine()       # Analyzer: finds entities (PERSON, URL, ...) in text
anonymizer = AnonymizerEngine()   # Anonymizer: rewrites text using the analyzer's findings

In [18]:
def analyze_and_store_results(df, analyzer):
    """
    Analyze the text in each cell of the DataFrame and store the analysis results.

    Every cell is passed to ``analyzer.analyze`` with the entity types PERSON,
    PHONE_NUMBER, EMAIL_ADDRESS and URL and language ``'en'``. If the analyzer
    raises for a cell, the error is printed and an empty list is stored for that
    cell, so the remaining results of the row stay in their own columns.

    Args:
        df (DataFrame): DataFrame containing text data to be analyzed.
        analyzer: Presidio Analyzer instance (any object exposing a compatible
            ``analyze`` method).

    Returns:
        DataFrame: Frame with the same index and columns as ``df`` whose cells
        hold the analyzer output for the corresponding input cell.
    """
    results_list = []
    # itertuples(name=None) yields plain tuples, so cells are paired with their
    # column by position and column labels never need to be valid identifiers.
    for index, row_values in zip(df.index, df.itertuples(index=False, name=None)):
        row_results = []
        for col, cell_text in zip(df.columns, row_values):
            try:
                # Store the entire analysis output for flexibility
                analyzer_result = analyzer.analyze(text=cell_text,
                                                   entities=['PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'URL'],
                                                   language='en',
                                                   return_decision_process=True)
            except Exception as e:
                print(f"Error analyzing row {index}, column {col}: {e}")
                # Placeholder keeps the row aligned with df.columns.
                analyzer_result = []
            row_results.append(analyzer_result)
        results_list.append(row_results)

    # Reuse the input's index so results can be looked up with the same labels.
    results_df = pd.DataFrame(results_list, index=df.index, columns=df.columns)
    return results_df

In [19]:
def process_cell(text, analyzer_results, anonymizer_engine=None):
    """
    Process text in a cell based on Presidio Analyzer results.

    The text is anonymized and every ``<...>`` token in the anonymized text is
    wrapped in ANSI red escape codes so it stands out when printed. Note that
    the highlighting is purely pattern based: any ``<...>`` substring in the
    anonymized text is coloured, including one that was already in the input.

    Args:
        text (str): Text to be processed.
        analyzer_results (list): List of Presidio Analyzer results. ``None`` or
            a float (e.g. NaN from a padded DataFrame cell) is treated as
            "no results".
        anonymizer_engine: Object with an ``anonymize(text=..., analyzer_results=...)``
            method whose return value has a ``text`` attribute. Defaults to the
            notebook-level ``anonymizer``.

    Returns:
        tuple: Processed text and analyzer output, where the analyzer output is
        a list of ``(matched_text, start, end)`` tuples.
    """
    if anonymizer_engine is None:
        # Fall back to the engine created at notebook level.
        anonymizer_engine = anonymizer

    # A missing result (None / NaN) means nothing was detected for this cell.
    if analyzer_results is None or isinstance(analyzer_results, float):
        analyzer_results = []

    # Extract text and positions from Presidio Analyzer results
    analyzer_output = [(text[res.start:res.end], res.start, res.end) for res in analyzer_results]

    # Anonymize text using Presidio Anonymizer
    anonymized_result = anonymizer_engine.anonymize(text=text, analyzer_results=analyzer_results)

    # Highlight anonymized entities (the <...> placeholders) in red
    replaced_text = re.sub(r"(<[^>]*>)", lambda m: "\033[31m" + m.group(1) + "\033[0m", anonymized_result.text)
    return replaced_text, analyzer_output

## Webserver Logs

In [20]:
# Read the web-server access log, splitting each line on '- -', relabel the
# columns A, B, ... and cast every cell to str so it can be analyzed as text.
data = pd.read_csv('../input/webserverlogs10k/web-server-access-logs_10k.log', sep='- -', header=None)
data.columns = list(map(chr, range(65, 65 + data.shape[1])))
data = data.astype(str)
print(data.shape, data.columns, sep='\n')
data.head()

(10001, 2)
Index(['A', 'B'], dtype='object')


Unnamed: 0,A,B
0,",""54.36.149.41","[22/Jan/2019:03:56:14 +0330] """"GET /filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,27|%DA%A9%D9%85%D8%AA%D8%B1%20%D8%A7%D8%B2%205%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,p53 HTTP/1.1"""" 200 30577 """"-"""" """"Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)"""" """"-"""""""
1,"3089189,""5.210.140.170","[23/Jan/2019:12:43:40 +0330] """"GET /apple-touch-icon-120x120.png HTTP/1.1"""" 404 33679 """"-"""" """"MobileSafari/604.1 CFNetwork/976 Darwin/18.2.0"""" """"-"""""""
2,"6073343,""66.249.66.91","[24/Jan/2019:21:17:43 +0330] """"GET /static/images/guarantees/bestPrice.png HTTP/1.1"""" 304 0 """"-"""" """"Googlebot-Image/1.0"""" """"-"""""""
3,"9392152,""5.200.69.130","[26/Jan/2019:13:54:06 +0330] """"GET /image/1/brand HTTP/1.1"""" 200 3924 """"https://www.zanbil.ir/filter/b1,p62"""" """"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"""" """"-"""""""
4,"4366289,""5.208.174.46","[23/Jan/2019:22:52:04 +0330] """"GET /image/1221/mainSlideMobile HTTP/1.1"""" 200 68387 """"https://www.zanbil.ir/m/browse/cell-phone/%DA%AF%D9%88%D8%B4%DB%8C-%D9%85%D9%88%D8%A8%D8%A7%DB%8C%D9%84"""" """"Mozilla/5.0 (Linux; Android 6.0.1; SAMSUNG SM-N910C Build/MMB29K) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/8.2 Chrome/63.0.3239.111 Mobile Safari/537.36"""" """"-"""""""


### Anonymized

In [21]:
# Run the analyzer over every cell. No defensive copy is needed because
# analyze_and_store_results only reads from the frame it is given.
analyzed_data = analyze_and_store_results(data, analyzer)
analyzed_data.head()

Unnamed: 0,A,B
0,[],"[type: URL, start: 273, end: 297, score: 0.6]"
1,[],"[type: URL, start: 37, end: 64, score: 0.5]"
2,[],"[type: PERSON, start: 5, end: 22, score: 0.85, type: URL, start: 62, end: 74, score: 0.5]"
3,[],"[type: PERSON, start: 155, end: 173, score: 0.85, type: URL, start: 73, end: 110, score: 0.6]"
4,[],"[type: URL, start: 87, end: 192, score: 0.6, type: PHONE_NUMBER, start: 326, end: 339, score: 0.4]"


In [22]:
# Anonymize every cell using the analyzer results computed for it. The two
# frames are walked in parallel by position (analyzed_data was built from data,
# so row order and columns match). Rows are collected in a list and the
# DataFrame is built once, rather than enlarging it one cell at a time via .loc.
processed_rows = [
    [process_cell(text, analyzer_results)[0]
     for text, analyzer_results in zip(text_row, results_row)]
    for text_row, results_row in zip(
        data.itertuples(index=False, name=None),
        analyzed_data.itertuples(index=False, name=None),
    )
]
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A,B
0,",""54.36.149.41","[22/Jan/2019:03:56:14 +0330] """"GET /filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,27|%DA%A9%D9%85%D8%AA%D8%B1%20%D8%A7%D8%B2%205%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,p53 HTTP/1.1"""" 200 30577 """"-"""" """"Mozilla/5.0 (compatible; AhrefsBot/6.1; +[31m<URL>[0m)"""" """"-"""""""
1,"3089189,""5.210.140.170","[23/Jan/2019:12:43:40 +0330] """"GET /[31m<URL>[0mg HTTP/1.1"""" 404 33679 """"-"""" """"MobileSafari/604.1 CFNetwork/976 Darwin/18.2.0"""" """"-"""""""
2,"6073343,""66.249.66.91","[24/[31m<PERSON>[0m +0330] """"GET /static/images/guarantees/[31m<URL>[0mg HTTP/1.1"""" 304 0 """"-"""" """"Googlebot-Image/1.0"""" """"-"""""""
3,"9392152,""5.200.69.130","[26/Jan/2019:13:54:06 +0330] """"GET /image/1/brand HTTP/1.1"""" 200 3924 """"[31m<URL>[0m """"Mozilla/5.0 (Windows NT 10.0; Win64; x64) [31m<PERSON>[0m (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"""" """"-"""""""
4,"4366289,""5.208.174.46","[23/Jan/2019:22:52:04 +0330] """"GET /image/1221/mainSlideMobile HTTP/1.1"""" 200 68387 """"[31m<URL>[0m """"Mozilla/5.0 (Linux; Android 6.0.1; SAMSUNG SM-N910C Build/MMB29K) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/8.2 Chrome/[31m<PHONE_NUMBER>[0m Mobile Safari/537.36"""" """"-"""""""


## Web Logs (weblog.csv)

In [23]:
# Load the web log CSV, relabel the columns A, B, ... and cast every cell to
# str so it can be analyzed as text.
data = pd.read_csv('../input/web-log-dataset/weblog.csv')
data.columns = list(map(chr, range(65, 65 + data.shape[1])))
data = data.astype(str)
print(data.shape, data.columns, sep='\n')
data.head()

(16007, 4)
Index(['A', 'B', 'C', 'D'], dtype='object')


Unnamed: 0,A,B,C,D
0,10.128.2.1,[29/Nov/2017:06:58:55,GET /login.php HTTP/1.1,200
1,10.128.2.1,[29/Nov/2017:06:59:02,POST /process.php HTTP/1.1,302
2,10.128.2.1,[29/Nov/2017:06:59:03,GET /home.php HTTP/1.1,200
3,10.131.2.1,[29/Nov/2017:06:59:04,GET /js/vendor/moment.min.js HTTP/1.1,200
4,10.130.2.1,[29/Nov/2017:06:59:06,GET /bootstrap-3.3.7/js/bootstrap.js HTTP/1.1,200


In [24]:
# Run the analyzer over every cell. No defensive copy is needed because
# analyze_and_store_results only reads from the frame it is given.
analyzed_data = analyze_and_store_results(data, analyzer)
analyzed_data.head()

Unnamed: 0,A,B,C,D
0,[],[],"[type: URL, start: 5, end: 13, score: 0.5]",[]
1,[],[],"[type: URL, start: 6, end: 16, score: 0.5]",[]
2,[],[],"[type: URL, start: 5, end: 12, score: 0.5]",[]
3,[],[],[],[]
4,[],[],[],[]


In [25]:
# Anonymize every cell using the analyzer results computed for it. The two
# frames are walked in parallel by position (analyzed_data was built from data,
# so row order and columns match). Rows are collected in a list and the
# DataFrame is built once, rather than enlarging it one cell at a time via .loc.
processed_rows = [
    [process_cell(text, analyzer_results)[0]
     for text, analyzer_results in zip(text_row, results_row)]
    for text_row, results_row in zip(
        data.itertuples(index=False, name=None),
        analyzed_data.itertuples(index=False, name=None),
    )
]
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A,B,C,D
0,10.128.2.1,[29/Nov/2017:06:58:55,GET /[31m<URL>[0mp HTTP/1.1,200
1,10.128.2.1,[29/Nov/2017:06:59:02,POST /[31m<URL>[0mp HTTP/1.1,302
2,10.128.2.1,[29/Nov/2017:06:59:03,GET /[31m<URL>[0mp HTTP/1.1,200
3,10.131.2.1,[29/Nov/2017:06:59:04,GET /js/vendor/moment.min.js HTTP/1.1,200
4,10.130.2.1,[29/Nov/2017:06:59:06,GET /bootstrap-3.3.7/js/bootstrap.js HTTP/1.1,200


## Server Logs (CIDDS-001 external, week 1)

In [28]:
# Load the CIDDS-001 external week-1 CSV and keep the first 1000 rows, then
# relabel the columns A, B, ... and cast every cell to str for analysis.
data = pd.read_csv('../input/server-logs-suspicious/CIDDS-001-external-week1.csv').iloc[:1000]
data.columns = list(map(chr, range(65, 65 + data.shape[1])))
data = data.astype(str)
print(data.shape, data.columns, sep='\n')
data.head()

(1000, 16)
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
       'O', 'P'],
      dtype='object')


Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P
0,2017-03-14 17:43:57.172,81412.697,TCP,EXT_SERVER,8082,OPENSTACK_NET,56978.0,3057,2.1 M,1,.AP...,0,normal,---,---,---
1,2017-03-14 17:43:57.172,81412.697,TCP,OPENSTACK_NET,56978,EXT_SERVER,8082.0,4748,2.5 M,1,.AP...,0,normal,---,---,---
2,2017-03-14 17:43:26.135,81504.787,TCP,EXT_SERVER,8082,OPENSTACK_NET,56979.0,8639,9.1 M,1,.AP...,0,normal,---,---,---
3,2017-03-14 17:43:26.135,81504.787,TCP,OPENSTACK_NET,56979,EXT_SERVER,8082.0,12024,10.3 M,1,.AP...,0,normal,---,---,---
4,2017-03-14 18:17:09.005,82100.692,TCP,EXT_SERVER,8082,OPENSTACK_NET,51649.0,11012,27.2 M,1,.AP.S.,0,normal,---,---,---


In [29]:
# Run the analyzer over every cell. No defensive copy is needed because
# analyze_and_store_results only reads from the frame it is given.
analyzed_data = analyze_and_store_results(data, analyzer)
analyzed_data.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P
0,[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
1,[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
2,[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
3,[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
4,[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]


In [30]:
# Anonymize every cell using the analyzer results computed for it. The two
# frames are walked in parallel by position (analyzed_data was built from data,
# so row order and columns match). Rows are collected in a list and the
# DataFrame is built once, rather than enlarging it one cell at a time via .loc.
processed_rows = [
    [process_cell(text, analyzer_results)[0]
     for text, analyzer_results in zip(text_row, results_row)]
    for text_row, results_row in zip(
        data.itertuples(index=False, name=None),
        analyzed_data.itertuples(index=False, name=None),
    )
]
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P
0,2017-03-14 17:43:57.172,81412.697,TCP,EXT_SERVER,8082,OPENSTACK_NET,56978.0,3057,2.1 M,1,.AP...,0,normal,---,---,---
1,2017-03-14 17:43:57.172,81412.697,TCP,OPENSTACK_NET,56978,EXT_SERVER,8082.0,4748,2.5 M,1,.AP...,0,normal,---,---,---
2,2017-03-14 17:43:26.135,81504.787,TCP,EXT_SERVER,8082,OPENSTACK_NET,56979.0,8639,9.1 M,1,.AP...,0,normal,---,---,---
3,2017-03-14 17:43:26.135,81504.787,TCP,OPENSTACK_NET,56979,EXT_SERVER,8082.0,12024,10.3 M,1,.AP...,0,normal,---,---,---
4,2017-03-14 18:17:09.005,82100.692,TCP,EXT_SERVER,8082,OPENSTACK_NET,51649.0,11012,27.2 M,1,.AP.S.,0,normal,---,---,---


## Server Logs (logfiles.log)

In [31]:
# Load logfiles.log with read_csv's default comma separator and keep the first
# 1000 rows, then relabel the columns A, B, ... and cast every cell to str.
# NOTE(review): without header=None the first line of the file is consumed as
# column names before being relabelled - confirm the file really starts with a
# header row and not with a log record.
data = pd.read_csv('../input/server-logs/logfiles.log').iloc[:1000]
data.columns = list(map(chr, range(65, 65 + data.shape[1])))
data = data.astype(str)
print(data.shape, data.columns, sep='\n')
data.head()

(1000, 2)
Index(['A', 'B'], dtype='object')


Unnamed: 0,A,B
0,"162.253.4.179 - - [27/Dec/2037:12:00:00 +0530] ""GET /usr/admin/developer HTTP/1.0"" 200 5041 ""http://www.parker-miller.org/tag/list/list/privacy/"" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML","like Gecko) Chrome/87.0.4280.141 Safari/537.36"" 3885"
1,"252.156.232.172 - - [27/Dec/2037:12:00:00 +0530] ""POST /usr/register HTTP/1.0"" 404 5028 ""-"" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML","like Gecko) Chrome/87.0.4280.88 Safari/537.36 OPR/73.0.3856.329"" 3350"
2,"182.215.249.159 - - [27/Dec/2037:12:00:00 +0530] ""PUT /usr/register HTTP/1.0"" 304 4936 ""http://www.parker-miller.org/tag/list/list/privacy/"" ""Mozilla/5.0 (Android 10; Mobile; rv:84.0) Gecko/84.0 Firefox/84.0"" 767",
3,"160.36.208.51 - - [27/Dec/2037:12:00:00 +0530] ""POST /usr HTTP/1.0"" 304 4979 ""http://www.parker-miller.org/tag/list/list/privacy/"" ""Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000) AppleWebKit/537.36 (KHTML","like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36 OPR/61.2.3076.56749"" 84"
4,"255.231.52.33 - - [27/Dec/2037:12:00:00 +0530] ""PUT /usr/admin/developer HTTP/1.0"" 403 5054 ""http://www.parker-miller.org/tag/list/list/privacy/"" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML","like Gecko) Chrome/87.0.4280.88 Safari/537.36 OPR/73.0.3856.329"" 3629"


In [32]:
# Run the analyzer over every cell. No defensive copy is needed because
# analyze_and_store_results only reads from the frame it is given.
analyzed_data = analyze_and_store_results(data, analyzer)
analyzed_data.head()

Unnamed: 0,A,B
0,"[type: PERSON, start: 189, end: 207, score: 0.85, type: URL, start: 93, end: 145, score: 0.6, type: PHONE_NUMBER, start: 0, end: 13, score: 0.4]","[type: PHONE_NUMBER, start: 20, end: 33, score: 0.4]"
1,"[type: PERSON, start: 135, end: 153, score: 0.85]","[type: PHONE_NUMBER, start: 51, end: 64, score: 0.4]"
2,"[type: URL, start: 88, end: 140, score: 0.6]",[]
3,"[type: URL, start: 78, end: 130, score: 0.6, type: PHONE_NUMBER, start: 0, end: 13, score: 0.4]","[type: PHONE_NUMBER, start: 20, end: 33, score: 0.4]"
4,"[type: PERSON, start: 189, end: 207, score: 0.85, type: URL, start: 93, end: 145, score: 0.6, type: PHONE_NUMBER, start: 0, end: 13, score: 0.4]","[type: PHONE_NUMBER, start: 51, end: 64, score: 0.4]"


In [33]:
# Anonymize every cell using the analyzer results computed for it. The two
# frames are walked in parallel by position (analyzed_data was built from data,
# so row order and columns match). Rows are collected in a list and the
# DataFrame is built once, rather than enlarging it one cell at a time via .loc.
processed_rows = [
    [process_cell(text, analyzer_results)[0]
     for text, analyzer_results in zip(text_row, results_row)]
    for text_row, results_row in zip(
        data.itertuples(index=False, name=None),
        analyzed_data.itertuples(index=False, name=None),
    )
]
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A,B
0,"[31m<PHONE_NUMBER>[0m - - [27/Dec/2037:12:00:00 +0530] ""GET /usr/admin/developer HTTP/1.0"" 200 5041 ""[31m<URL>[0m ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) [31m<PERSON>[0m (KHTML","like Gecko) Chrome/[31m<PHONE_NUMBER>[0m Safari/537.36"" 3885"
1,"252.156.232.172 - - [27/Dec/2037:12:00:00 +0530] ""POST /usr/register HTTP/1.0"" 404 5028 ""-"" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) [31m<PERSON>[0m (KHTML","like Gecko) Chrome/87.0.4280.88 Safari/537.36 OPR/[31m<PHONE_NUMBER>[0m"" 3350"
2,"182.215.249.159 - - [27/Dec/2037:12:00:00 +0530] ""PUT /usr/register HTTP/1.0"" 304 4936 ""[31m<URL>[0m ""Mozilla/5.0 (Android 10; Mobile; rv:84.0) Gecko/84.0 Firefox/84.0"" 767",
3,"[31m<PHONE_NUMBER>[0m - - [27/Dec/2037:12:00:00 +0530] ""POST /usr HTTP/1.0"" 304 4979 ""[31m<URL>[0m ""Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000) AppleWebKit/537.36 (KHTML","like Gecko) Chrome/[31m<PHONE_NUMBER>[0m Mobile Safari/537.36 OPR/61.2.3076.56749"" 84"
4,"[31m<PHONE_NUMBER>[0m - - [27/Dec/2037:12:00:00 +0530] ""PUT /usr/admin/developer HTTP/1.0"" 403 5054 ""[31m<URL>[0m ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) [31m<PERSON>[0m (KHTML","like Gecko) Chrome/87.0.4280.88 Safari/537.36 OPR/[31m<PHONE_NUMBER>[0m"" 3629"


## Access Logs

In [34]:
# Read the access log, splitting each line on '- -', and keep the first 1000
# rows; then relabel the columns A, B, ... and cast every cell to str.
data = pd.read_csv('../input/access-log/access_log.txt', sep='- -', header=None).iloc[:1000]
data.columns = list(map(chr, range(65, 65 + data.shape[1])))
data = data.astype(str)
print(data.shape, data.columns, sep='\n')
data.head()

(1000, 2)
Index(['A', 'B'], dtype='object')


Unnamed: 0,A,B
0,10.223.157.186,"[15/Jul/2009:14:58:59 -0700] ""GET / HTTP/1.1"" 403 202"
1,10.223.157.186,"[15/Jul/2009:14:58:59 -0700] ""GET /favicon.ico HTTP/1.1"" 404 209"
2,10.223.157.186,"[15/Jul/2009:15:50:35 -0700] ""GET / HTTP/1.1"" 200 9157"
3,10.223.157.186,"[15/Jul/2009:15:50:35 -0700] ""GET /assets/js/lowpro.js HTTP/1.1"" 200 10469"
4,10.223.157.186,"[15/Jul/2009:15:50:35 -0700] ""GET /assets/css/reset.css HTTP/1.1"" 200 1014"


In [35]:
# Run the analyzer over every cell. No defensive copy is needed because
# analyze_and_store_results only reads from the frame it is given.
analyzed_data = analyze_and_store_results(data, analyzer)
analyzed_data.head()

Unnamed: 0,A,B
0,[],[]
1,[],[]
2,[],"[type: PHONE_NUMBER, start: 20, end: 28, score: 0.4]"
3,[],"[type: PHONE_NUMBER, start: 20, end: 28, score: 0.4]"
4,[],"[type: PHONE_NUMBER, start: 20, end: 28, score: 0.4]"


In [36]:
# Anonymize every cell using the analyzer results computed for it. The two
# frames are walked in parallel by position (analyzed_data was built from data,
# so row order and columns match). Rows are collected in a list and the
# DataFrame is built once, rather than enlarging it one cell at a time via .loc.
processed_rows = [
    [process_cell(text, analyzer_results)[0]
     for text, analyzer_results in zip(text_row, results_row)]
    for text_row, results_row in zip(
        data.itertuples(index=False, name=None),
        analyzed_data.itertuples(index=False, name=None),
    )
]
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A,B
0,10.223.157.186,"[15/Jul/2009:14:58:59 -0700] ""GET / HTTP/1.1"" 403 202"
1,10.223.157.186,"[15/Jul/2009:14:58:59 -0700] ""GET /favicon.ico HTTP/1.1"" 404 209"
2,10.223.157.186,"[15/Jul/2009:15:50:[31m<PHONE_NUMBER>[0m] ""GET / HTTP/1.1"" 200 9157"
3,10.223.157.186,"[15/Jul/2009:15:50:[31m<PHONE_NUMBER>[0m] ""GET /assets/js/lowpro.js HTTP/1.1"" 200 10469"
4,10.223.157.186,"[15/Jul/2009:15:50:[31m<PHONE_NUMBER>[0m] ""GET /assets/css/reset.css HTTP/1.1"" 200 1014"


## Web Logs (NASA website, August 1995)

In [37]:
# Load the NASA web log CSV and keep the first 1000 rows, then relabel the
# columns A, B, ... and cast every cell to str for analysis.
data = pd.read_csv('../input/nasa-website-data/nasa_aug95_c.csv').iloc[:1000]
data.columns = list(map(chr, range(65, 65 + data.shape[1])))
data = data.astype(str)
print(data.shape, data.columns, sep='\n')
data.head()

(1000, 5)
Index(['A', 'B', 'C', 'D', 'E'], dtype='object')


Unnamed: 0,A,B,C,D,E
0,in24.inetnebr.com,1995-08-01T00:00:01+00:00,GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0,200,1839.0
1,uplherc.upl.com,1995-08-01T00:00:07+00:00,GET / HTTP/1.0,304,0.0
2,uplherc.upl.com,1995-08-01T00:00:08+00:00,GET /images/ksclogo-medium.gif HTTP/1.0,304,0.0
3,uplherc.upl.com,1995-08-01T00:00:08+00:00,GET /images/MOSAIC-logosmall.gif HTTP/1.0,304,0.0
4,uplherc.upl.com,1995-08-01T00:00:08+00:00,GET /images/USA-logosmall.gif HTTP/1.0,304,0.0


In [38]:
# Run the analyzer over every cell. No defensive copy is needed because
# analyze_and_store_results only reads from the frame it is given.
analyzed_data = analyze_and_store_results(data, analyzer)
analyzed_data.head()

Unnamed: 0,A,B,C,D,E
0,"[type: URL, start: 0, end: 17, score: 0.5]",[],[],[],[]
1,"[type: URL, start: 0, end: 15, score: 0.5]",[],[],[],[]
2,"[type: URL, start: 0, end: 15, score: 0.5]",[],"[type: URL, start: 12, end: 29, score: 0.5]",[],[]
3,"[type: URL, start: 0, end: 15, score: 0.5]",[],"[type: URL, start: 12, end: 31, score: 0.5]",[],[]
4,"[type: URL, start: 0, end: 15, score: 0.5]",[],"[type: URL, start: 12, end: 28, score: 0.5]",[],[]


In [39]:
# Anonymize every cell using the analyzer results computed for it. The two
# frames are walked in parallel by position (analyzed_data was built from data,
# so row order and columns match). Rows are collected in a list and the
# DataFrame is built once, rather than enlarging it one cell at a time via .loc.
processed_rows = [
    [process_cell(text, analyzer_results)[0]
     for text, analyzer_results in zip(text_row, results_row)]
    for text_row, results_row in zip(
        data.itertuples(index=False, name=None),
        analyzed_data.itertuples(index=False, name=None),
    )
]
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A,B,C,D,E
0,[31m<URL>[0m,1995-08-01T00:00:01+00:00,GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0,200,1839.0
1,[31m<URL>[0m,1995-08-01T00:00:07+00:00,GET / HTTP/1.0,304,0.0
2,[31m<URL>[0m,1995-08-01T00:00:08+00:00,GET /images/[31m<URL>[0mf HTTP/1.0,304,0.0
3,[31m<URL>[0m,1995-08-01T00:00:08+00:00,GET /images/[31m<URL>[0mf HTTP/1.0,304,0.0
4,[31m<URL>[0m,1995-08-01T00:00:08+00:00,GET /images/[31m<URL>[0mf HTTP/1.0,304,0.0


## SSH Login Attempts Logs

In [40]:
# Load the SSH login attempts CSV and keep the first 1000 rows, then relabel
# the columns A, B, ... and cast every cell to str for analysis.
data = pd.read_csv('../input/ssh-login-attempts-on-my-raspberry-pi/ssh_login_attempts.csv').iloc[:1000]
data.columns = list(map(chr, range(65, 65 + data.shape[1])))
data = data.astype(str)
print(data.shape, data.columns, sep='\n')
data.head()

(1000, 6)
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')


Unnamed: 0,A,B,C,D,E,F
0,Mar,6,06:25:24,root,20.187.88.188,59126
1,Mar,6,06:25:25,leonardo,161.82.233.179,44304
2,Mar,6,06:25:35,master,49.234.24.246,50730
3,Mar,6,06:25:37,root,183.88.189.109,50401
4,Mar,6,06:26:07,root,154.221.19.60,53614


In [41]:
# Run the analyzer over every cell. No defensive copy is needed because
# analyze_and_store_results only reads from the frame it is given.
analyzed_data = analyze_and_store_results(data, analyzer)
analyzed_data.head()

Unnamed: 0,A,B,C,D,E,F
0,[],[],[],[],"[type: PHONE_NUMBER, start: 0, end: 13, score: 0.4]",[]
1,[],[],[],[],"[type: PHONE_NUMBER, start: 0, end: 14, score: 0.4]",[]
2,[],[],[],[],"[type: PHONE_NUMBER, start: 0, end: 13, score: 0.4]",[]
3,[],[],[],[],"[type: PHONE_NUMBER, start: 0, end: 14, score: 0.4]",[]
4,[],[],[],[],"[type: PHONE_NUMBER, start: 0, end: 13, score: 0.4]",[]


In [42]:
# Anonymize every cell using the analyzer results computed for it. The two
# frames are walked in parallel by position (analyzed_data was built from data,
# so row order and columns match). Rows are collected in a list and the
# DataFrame is built once, rather than enlarging it one cell at a time via .loc.
processed_rows = [
    [process_cell(text, analyzer_results)[0]
     for text, analyzer_results in zip(text_row, results_row)]
    for text_row, results_row in zip(
        data.itertuples(index=False, name=None),
        analyzed_data.itertuples(index=False, name=None),
    )
]
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A,B,C,D,E,F
0,Mar,6,06:25:24,root,[31m<PHONE_NUMBER>[0m,59126
1,Mar,6,06:25:25,leonardo,[31m<PHONE_NUMBER>[0m,44304
2,Mar,6,06:25:35,master,[31m<PHONE_NUMBER>[0m,50730
3,Mar,6,06:25:37,root,[31m<PHONE_NUMBER>[0m,50401
4,Mar,6,06:26:07,root,[31m<PHONE_NUMBER>[0m,53614


## SSH Logs

In [43]:
# Read the raw SSH log one line per row. Recent pandas releases raise a
# ValueError for read_csv(..., sep='\n'), so the lines are read directly.
# Blank lines are skipped (read_csv also skips them by default) and reading
# stops after the first 1000 kept lines.
with open('../input/loghub-ssh-log-data/SSH.log', encoding='utf-8') as log_file:
    non_blank_lines = (line.rstrip('\r\n') for line in log_file if line.strip())
    log_lines = [line for _, line in zip(range(1000), non_blank_lines)]
data = pd.DataFrame({'Log_contents': log_lines})
data.columns = [chr(65 + i) for i in range(len(data.columns))]
data = data.astype(str)
print(data.shape)
print(data.columns)
data.head()

(1000, 1)
Index(['A'], dtype='object')


Unnamed: 0,A
0,Dec 10 06:55:46 LabSZ sshd[24200]: reverse mapping checking getaddrinfo for ns.marryaldkfaczcz.com [173.234.31.186] failed - POSSIBLE BREAK-IN ATTEMPT!
1,Dec 10 06:55:46 LabSZ sshd[24200]: Invalid user webmaster from 173.234.31.186
2,Dec 10 06:55:46 LabSZ sshd[24200]: input_userauth_request: invalid user webmaster [preauth]
3,Dec 10 06:55:46 LabSZ sshd[24200]: pam_unix(sshd:auth): check pass; user unknown
4,Dec 10 06:55:46 LabSZ sshd[24200]: pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=173.234.31.186


In [44]:
# Run the analyzer over every cell. No defensive copy is needed because
# analyze_and_store_results only reads from the frame it is given.
analyzed_data = analyze_and_store_results(data, analyzer)
analyzed_data.head()

Unnamed: 0,A
0,"[type: URL, start: 76, end: 98, score: 0.5, type: PHONE_NUMBER, start: 99, end: 114, score: 0.4]"
1,"[type: PHONE_NUMBER, start: 63, end: 77, score: 0.4]"
2,[]
3,[]
4,"[type: PERSON, start: 89, end: 137, score: 0.85, type: PHONE_NUMBER, start: 123, end: 137, score: 0.4]"


In [45]:
# Anonymize every cell using the analyzer results computed for it. The two
# frames are walked in parallel by position (analyzed_data was built from data,
# so row order and columns match). Rows are collected in a list and the
# DataFrame is built once, rather than enlarging it one cell at a time via .loc.
processed_rows = [
    [process_cell(text, analyzer_results)[0]
     for text, analyzer_results in zip(text_row, results_row)]
    for text_row, results_row in zip(
        data.itertuples(index=False, name=None),
        analyzed_data.itertuples(index=False, name=None),
    )
]
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A
0,Dec 10 06:55:46 LabSZ sshd[24200]: reverse mapping checking getaddrinfo for [31m<URL>[0m [31m<PHONE_NUMBER>[0m] failed - POSSIBLE BREAK-IN ATTEMPT!
1,Dec 10 06:55:46 LabSZ sshd[24200]: Invalid user webmaster from [31m<PHONE_NUMBER>[0m
2,Dec 10 06:55:46 LabSZ sshd[24200]: input_userauth_request: invalid user webmaster [preauth]
3,Dec 10 06:55:46 LabSZ sshd[24200]: pam_unix(sshd:auth): check pass; user unknown
4,Dec 10 06:55:46 LabSZ sshd[24200]: pam_unix(sshd:auth): authentication failure; logname= [31m<PERSON>[0m


## Apache Logs

In [46]:
# Read the raw Apache log one line per row. Recent pandas releases raise a
# ValueError for read_csv(..., sep='\n'), so the lines are read directly.
# Blank lines are skipped (read_csv also skips them by default) and reading
# stops after the first 1000 kept lines.
with open('../input/loghub-apache-log-data/Apache.log', encoding='utf-8') as log_file:
    non_blank_lines = (line.rstrip('\r\n') for line in log_file if line.strip())
    log_lines = [line for _, line in zip(range(1000), non_blank_lines)]
data = pd.DataFrame({'Log_contents': log_lines})
data.columns = [chr(65 + i) for i in range(len(data.columns))]
data = data.astype(str)
print(data.shape)
print(data.columns)
data.head()

(1000, 1)
Index(['A'], dtype='object')


Unnamed: 0,A
0,[Thu Jun 09 06:07:04 2005] [notice] LDAP: Built with OpenLDAP LDAP SDK
1,[Thu Jun 09 06:07:04 2005] [notice] LDAP: SSL support unavailable
2,[Thu Jun 09 06:07:04 2005] [notice] suEXEC mechanism enabled (wrapper: /usr/sbin/suexec)
3,[Thu Jun 09 06:07:05 2005] [notice] Digest: generating secret for digest authentication ...
4,[Thu Jun 09 06:07:05 2005] [notice] Digest: done


In [47]:
# Run the analyzer over every cell. No defensive copy is needed because
# analyze_and_store_results only reads from the frame it is given.
analyzed_data = analyze_and_store_results(data, analyzer)
analyzed_data.head()

Unnamed: 0,A
0,[]
1,[]
2,[]
3,[]
4,[]


In [48]:
# Anonymize every cell using the analyzer results computed for it. The two
# frames are walked in parallel by position (analyzed_data was built from data,
# so row order and columns match). Rows are collected in a list and the
# DataFrame is built once, rather than enlarging it one cell at a time via .loc.
processed_rows = [
    [process_cell(text, analyzer_results)[0]
     for text, analyzer_results in zip(text_row, results_row)]
    for text_row, results_row in zip(
        data.itertuples(index=False, name=None),
        analyzed_data.itertuples(index=False, name=None),
    )
]
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A
0,[Thu Jun 09 06:07:04 2005] [notice] LDAP: Built with OpenLDAP LDAP SDK
1,[Thu Jun 09 06:07:04 2005] [notice] LDAP: SSL support unavailable
2,[Thu Jun 09 06:07:04 2005] [notice] suEXEC mechanism enabled (wrapper: /usr/sbin/suexec)
3,[Thu Jun 09 06:07:05 2005] [notice] Digest: generating secret for digest authentication ...
4,[Thu Jun 09 06:07:05 2005] [notice] Digest: done


## Android Logs

In [49]:
# Read the raw Android log one line per row. Recent pandas releases raise a
# ValueError for read_csv(..., sep='\n'), so the lines are read directly.
# Blank lines are skipped (read_csv also skips them by default) and reading
# stops after the first 1000 kept lines.
with open('../input/loghub-android-log-data/Android.log', encoding='utf-8') as log_file:
    non_blank_lines = (line.rstrip('\r\n') for line in log_file if line.strip())
    log_lines = [line for _, line in zip(range(1000), non_blank_lines)]
data = pd.DataFrame({'Log_contents': log_lines})
data.columns = [chr(65 + i) for i in range(len(data.columns))]
data = data.astype(str)
print(data.shape)
print(data.columns)
data.head()

(1000, 1)
Index(['A'], dtype='object')


Unnamed: 0,A
0,12-17 19:31:36.263 1795 1825 I PowerManager_screenOn: DisplayPowerStatesetColorFadeLevel: level=1.0
1,"12-17 19:31:36.263 5224 5283 I SendBroadcastPermission: action:android.com.huawei.bone.NOTIFY_SPORT_DATA, mPermissionType:0"
2,"12-17 19:31:36.264 1795 1825 D DisplayPowerController: Animating brightness: target=21, rate=40"
3,"12-17 19:31:36.264 1795 1825 I PowerManager_screenOn: DisplayPowerController updatePowerState mPendingRequestLocked=policy=BRIGHT, useProximitySensor=true, useProximitySensorbyPhone=true, screenBrightness=33, screenAutoBrightnessAdjustment=0.0, brightnessSetByUser=true, useAutoBrightness=true, blockScreenOn=false, lowPowerMode=false, boostScreenBrightness=false, dozeScreenBrightness=-1, dozeScreenState=UNKNOWN, useTwilight=false, useSmartBacklight=true, brightnessWaitMode=false, brightnessWaitRet=true, screenAutoBrightness=-1, userId=0"
4,"12-17 19:31:36.264 1795 2750 I PowerManager_screenOn: DisplayPowerState Updating screen state: state=ON, backlight=823"


In [50]:
# Run the analyzer over every cell. No defensive copy is needed because
# analyze_and_store_results only reads from the frame it is given.
analyzed_data = analyze_and_store_results(data, analyzer)
analyzed_data.head()

Unnamed: 0,A
0,[]
1,"[type: URL, start: 65, end: 91, score: 0.5]"
2,[]
3,[]
4,[]


In [51]:
# Anonymize every cell using the analyzer results computed for it. The two
# frames are walked in parallel by position (analyzed_data was built from data,
# so row order and columns match). Rows are collected in a list and the
# DataFrame is built once, rather than enlarging it one cell at a time via .loc.
processed_rows = [
    [process_cell(text, analyzer_results)[0]
     for text, analyzer_results in zip(text_row, results_row)]
    for text_row, results_row in zip(
        data.itertuples(index=False, name=None),
        analyzed_data.itertuples(index=False, name=None),
    )
]
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A
0,12-17 19:31:36.263 1795 1825 I PowerManager_screenOn: DisplayPowerStatesetColorFadeLevel: level=1.0
1,"12-17 19:31:36.263 5224 5283 I SendBroadcastPermission: action:[31m<URL>[0mTIFY_SPORT_DATA, mPermissionType:0"
2,"12-17 19:31:36.264 1795 1825 D DisplayPowerController: Animating brightness: target=21, rate=40"
3,"12-17 19:31:36.264 1795 1825 I PowerManager_screenOn: DisplayPowerController updatePowerState mPendingRequestLocked=policy=BRIGHT, useProximitySensor=true, useProximitySensorbyPhone=true, screenBrightness=33, screenAutoBrightnessAdjustment=0.0, brightnessSetByUser=true, useAutoBrightness=true, blockScreenOn=false, lowPowerMode=false, boostScreenBrightness=false, dozeScreenBrightness=-1, dozeScreenState=UNKNOWN, useTwilight=false, useSmartBacklight=true, brightnessWaitMode=false, brightnessWaitRet=true, screenAutoBrightness=-1, userId=0"
4,"12-17 19:31:36.264 1795 2750 I PowerManager_screenOn: DisplayPowerState Updating screen state: state=ON, backlight=823"


## macOS Logs

In [53]:
# Load the first 1000 non-empty log lines, one DataFrame row per line.
# The file is read as plain text rather than with
# `pd.read_csv(..., sep='\n')`: newer pandas releases reject a line terminator
# as separator (ValueError), and CSV parsing would also apply quote/NA
# handling that is not wanted for raw log text.
log_lines = []
with open('../input/loghub-macos-log-data/Mac.log', encoding='utf-8') as log_file:
    for raw_line in log_file:
        line = raw_line.rstrip('\n')
        if not line.strip():
            continue  # skip blank lines, as read_csv does by default
        log_lines.append(line)
        if len(log_lines) == 1000:  # only the first 1000 entries are used
            break

data = pd.DataFrame({'Log_contents': log_lines})
# Short letter names (A, B, C, ...) are what the rest of the notebook expects
data.columns = [chr(65 + i) for i in range(len(data.columns))]
data = data.astype(str)
print(data.shape)
print(data.columns)
data.head()

(1000, 1)
Index(['A'], dtype='object')


Unnamed: 0,A
0,Jul 1 09:00:55 calvisitor-10-105-160-95 kernel[0]: AppleThunderboltNHIType2::prePCIWake - power up complete - took 2 us
1,Jul 1 09:00:55 calvisitor-10-105-160-95 kernel[0]: AppleThunderboltGenericHAL::earlyWake - complete - took 0 milliseconds
2,Jul 1 09:00:55 calvisitor-10-105-160-95 kernel[0]: AirPort: Link Down on awdl0. Reason 1 (Unspecified).
3,"Jul 1 09:00:55 calvisitor-10-105-160-95 kernel[0]: ARPT: 620651.021206: wl0: wl_update_tcpkeep_seq: Original Seq: 2477329075, Ack: 1662858865, Win size: 4096"
4,Jul 1 09:00:55 calvisitor-10-105-160-95 kernel[0]: Bluetooth -- LE is supported - Disable LE meta event


In [54]:
# Scan a copy of the log frame with the Presidio analyzer. The result mirrors
# `data`: each cell holds the findings reported for the matching input cell.
analyzed_data = analyze_and_store_results(df=data.copy(), analyzer=analyzer)
analyzed_data.head(5)

Unnamed: 0,A
0,[]
1,[]
2,[]
3,"[type: PHONE_NUMBER, start: 115, end: 125, score: 0.4]"
4,[]


In [55]:
# Anonymize every cell of `data` using the analyzer findings stored at the
# same (row, column) position in `analyzed_data`.
#
# The processed rows are collected in a plain list and the DataFrame is built
# once at the end; assigning cell-by-cell into an initially empty frame with
# `.loc` enlarges the frame on every new row label, which is slow.
processed_rows = []
for row_label, row_values in zip(data.index, data.itertuples(index=False, name=None)):
    processed_row = {}
    for col, text in zip(data.columns, row_values):
        # Analyzer results previously computed for this exact cell
        analyzer_results = analyzed_data.loc[row_label, col]

        # process_cell returns (processed_text, analyzer_output); only the text is kept
        processed_text, _analyzer_output = process_cell(text, analyzer_results)
        processed_row[col] = processed_text
    processed_rows.append(processed_row)

# Same index and column layout as the input frame
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A
0,Jul 1 09:00:55 calvisitor-10-105-160-95 kernel[0]: AppleThunderboltNHIType2::prePCIWake - power up complete - took 2 us
1,Jul 1 09:00:55 calvisitor-10-105-160-95 kernel[0]: AppleThunderboltGenericHAL::earlyWake - complete - took 0 milliseconds
2,Jul 1 09:00:55 calvisitor-10-105-160-95 kernel[0]: AirPort: Link Down on awdl0. Reason 1 (Unspecified).
3,"Jul 1 09:00:55 calvisitor-10-105-160-95 kernel[0]: ARPT: 620651.021206: wl0: wl_update_tcpkeep_seq: Original Seq: [31m<PHONE_NUMBER>[0m, Ack: 1662858865, Win size: 4096"
4,Jul 1 09:00:55 calvisitor-10-105-160-95 kernel[0]: Bluetooth -- LE is supported - Disable LE meta event


## HPC Logs

In [56]:
# Load the first 1000 non-empty log lines, one DataFrame row per line.
# The file is read as plain text rather than with
# `pd.read_csv(..., sep='\n')`: newer pandas releases reject a line terminator
# as separator (ValueError), and CSV parsing would also apply quote/NA
# handling that is not wanted for raw log text.
log_lines = []
with open('../input/loghub-hpc-log-data/HPC.log', encoding='utf-8') as log_file:
    for raw_line in log_file:
        line = raw_line.rstrip('\n')
        if not line.strip():
            continue  # skip blank lines, as read_csv does by default
        log_lines.append(line)
        if len(log_lines) == 1000:  # only the first 1000 entries are used
            break

data = pd.DataFrame({'Log_contents': log_lines})
# Short letter names (A, B, C, ...) are what the rest of the notebook expects
data.columns = [chr(65 + i) for i in range(len(data.columns))]
data = data.astype(str)
print(data.shape)
print(data.columns)
data.head()

(1000, 1)
Index(['A'], dtype='object')


Unnamed: 0,A
0,460903 resourcemgmtdaeomon node-25 server subsys 1145552216 1 failed to configure resourcemgmt subsystem err = 10
1,460919 resourcemgmtdaeomon node-25 server subsys 1145552221 1 failed to configure resourcemgmt subsystem err = 10
2,460932 resourcemgmtdaeomon node-25 server subsys 1145552226 1 failed to configure resourcemgmt subsystem err = 10
3,460949 resourcemgmtdaeomon node-25 server subsys 1145552232 1 failed to configure resourcemgmt subsystem err = 10
4,460971 resourcemgmtdaeomon node-25 server subsys 1145552237 1 failed to configure resourcemgmt subsystem err = 10


In [57]:
# Scan a copy of the log frame with the Presidio analyzer. The result mirrors
# `data`: each cell holds the findings reported for the matching input cell.
analyzed_data = analyze_and_store_results(df=data.copy(), analyzer=analyzer)
analyzed_data.head(5)

Unnamed: 0,A
0,"[type: PHONE_NUMBER, start: 49, end: 59, score: 0.4]"
1,"[type: PHONE_NUMBER, start: 49, end: 59, score: 0.4]"
2,"[type: PHONE_NUMBER, start: 49, end: 59, score: 0.4]"
3,"[type: PHONE_NUMBER, start: 49, end: 59, score: 0.4]"
4,"[type: PHONE_NUMBER, start: 49, end: 59, score: 0.4]"


In [58]:
# Anonymize every cell of `data` using the analyzer findings stored at the
# same (row, column) position in `analyzed_data`.
#
# The processed rows are collected in a plain list and the DataFrame is built
# once at the end; assigning cell-by-cell into an initially empty frame with
# `.loc` enlarges the frame on every new row label, which is slow.
processed_rows = []
for row_label, row_values in zip(data.index, data.itertuples(index=False, name=None)):
    processed_row = {}
    for col, text in zip(data.columns, row_values):
        # Analyzer results previously computed for this exact cell
        analyzer_results = analyzed_data.loc[row_label, col]

        # process_cell returns (processed_text, analyzer_output); only the text is kept
        processed_text, _analyzer_output = process_cell(text, analyzer_results)
        processed_row[col] = processed_text
    processed_rows.append(processed_row)

# Same index and column layout as the input frame
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A
0,460903 resourcemgmtdaeomon node-25 server subsys [31m<PHONE_NUMBER>[0m 1 failed to configure resourcemgmt subsystem err = 10
1,460919 resourcemgmtdaeomon node-25 server subsys [31m<PHONE_NUMBER>[0m 1 failed to configure resourcemgmt subsystem err = 10
2,460932 resourcemgmtdaeomon node-25 server subsys [31m<PHONE_NUMBER>[0m 1 failed to configure resourcemgmt subsystem err = 10
3,460949 resourcemgmtdaeomon node-25 server subsys [31m<PHONE_NUMBER>[0m 1 failed to configure resourcemgmt subsystem err = 10
4,460971 resourcemgmtdaeomon node-25 server subsys [31m<PHONE_NUMBER>[0m 1 failed to configure resourcemgmt subsystem err = 10


## Health App Logs

In [59]:
# Load the first 1000 non-empty log lines, one DataFrame row per line.
# The file is read as plain text rather than with
# `pd.read_csv(..., sep='\n')`: newer pandas releases reject a line terminator
# as separator (ValueError), and CSV parsing would also apply quote/NA
# handling that is not wanted for raw log text.
log_lines = []
with open('../input/loghub-healthapp-log-data/HealthApp.log', encoding='utf-8') as log_file:
    for raw_line in log_file:
        line = raw_line.rstrip('\n')
        if not line.strip():
            continue  # skip blank lines, as read_csv does by default
        log_lines.append(line)
        if len(log_lines) == 1000:  # only the first 1000 entries are used
            break

data = pd.DataFrame({'Log_contents': log_lines})
# Short letter names (A, B, C, ...) are what the rest of the notebook expects
data.columns = [chr(65 + i) for i in range(len(data.columns))]
data = data.astype(str)
print(data.shape)
print(data.columns)
data.head()

(1000, 1)
Index(['A'], dtype='object')


Unnamed: 0,A
0,20171223-22:15:29:606|Step_LSC|30002312|onStandStepChanged 3579
1,20171223-22:15:29:615|Step_LSC|30002312|onExtend:1514038530000 14 0 4
2,20171223-22:15:29:633|Step_StandReportReceiver|30002312|onReceive action: android.intent.action.SCREEN_ON
3,20171223-22:15:29:635|Step_LSC|30002312|processHandleBroadcastAction action:android.intent.action.SCREEN_ON
4,20171223-22:15:29:635|Step_StandStepCounter|30002312|flush sensor data


In [60]:
# Scan a copy of the log frame with the Presidio analyzer. The result mirrors
# `data`: each cell holds the findings reported for the matching input cell.
analyzed_data = analyze_and_store_results(df=data.copy(), analyzer=analyzer)
analyzed_data.head(5)

Unnamed: 0,A
0,"[type: PHONE_NUMBER, start: 0, end: 11, score: 0.4, type: PHONE_NUMBER, start: 31, end: 39, score: 0.4]"
1,"[type: PHONE_NUMBER, start: 0, end: 11, score: 0.4, type: PHONE_NUMBER, start: 31, end: 39, score: 0.4]"
2,"[type: URL, start: 74, end: 98, score: 0.5, type: PHONE_NUMBER, start: 0, end: 11, score: 0.4, type: PHONE_NUMBER, start: 47, end: 55, score: 0.4]"
3,"[type: URL, start: 76, end: 100, score: 0.5, type: PHONE_NUMBER, start: 0, end: 11, score: 0.4, type: PHONE_NUMBER, start: 31, end: 39, score: 0.4]"
4,"[type: PHONE_NUMBER, start: 0, end: 11, score: 0.4, type: PHONE_NUMBER, start: 44, end: 52, score: 0.4]"


In [61]:
# Anonymize every cell of `data` using the analyzer findings stored at the
# same (row, column) position in `analyzed_data`.
#
# The processed rows are collected in a plain list and the DataFrame is built
# once at the end; assigning cell-by-cell into an initially empty frame with
# `.loc` enlarges the frame on every new row label, which is slow.
processed_rows = []
for row_label, row_values in zip(data.index, data.itertuples(index=False, name=None)):
    processed_row = {}
    for col, text in zip(data.columns, row_values):
        # Analyzer results previously computed for this exact cell
        analyzer_results = analyzed_data.loc[row_label, col]

        # process_cell returns (processed_text, analyzer_output); only the text is kept
        processed_text, _analyzer_output = process_cell(text, analyzer_results)
        processed_row[col] = processed_text
    processed_rows.append(processed_row)

# Same index and column layout as the input frame
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A
0,[31m<PHONE_NUMBER>[0m:15:29:606|Step_LSC|[31m<PHONE_NUMBER>[0m|onStandStepChanged 3579
1,[31m<PHONE_NUMBER>[0m:15:29:615|Step_LSC|[31m<PHONE_NUMBER>[0m|onExtend:1514038530000 14 0 4
2,[31m<PHONE_NUMBER>[0m:15:29:633|Step_StandReportReceiver|[31m<PHONE_NUMBER>[0m|onReceive action: [31m<URL>[0mREEN_ON
3,[31m<PHONE_NUMBER>[0m:15:29:635|Step_LSC|[31m<PHONE_NUMBER>[0m|processHandleBroadcastAction action:[31m<URL>[0mREEN_ON
4,[31m<PHONE_NUMBER>[0m:15:29:635|Step_StandStepCounter|[31m<PHONE_NUMBER>[0m|flush sensor data


## OpenStack Abnormal Logs

In [62]:
# Load the first 1000 non-empty log lines, one DataFrame row per line.
# The file is read as plain text rather than with
# `pd.read_csv(..., sep='\n')`: newer pandas releases reject a line terminator
# as separator (ValueError), and CSV parsing would also apply quote/NA
# handling that is not wanted for raw log text.
log_lines = []
with open('../input/loghub-openstack-log-data/openstack_abnormal.log', encoding='utf-8') as log_file:
    for raw_line in log_file:
        line = raw_line.rstrip('\n')
        if not line.strip():
            continue  # skip blank lines, as read_csv does by default
        log_lines.append(line)
        if len(log_lines) == 1000:  # only the first 1000 entries are used
            break

data = pd.DataFrame({'Log_contents': log_lines})
# Short letter names (A, B, C, ...) are what the rest of the notebook expects
data.columns = [chr(65 + i) for i in range(len(data.columns))]
data = data.astype(str)
print(data.shape)
print(data.columns)
data.head()

(1000, 1)
Index(['A'], dtype='object')


Unnamed: 0,A
0,"nova-api.log.2017-05-14_21:27:04 2017-05-14 19:39:01.445 25746 INFO nova.osapi_compute.wsgi.server [req-5a2050e7-b381-4ae9-92d2-8b08e9f9f4c0 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1583 time: 0.1919448"
1,"nova-api.log.2017-05-14_21:27:04 2017-05-14 19:39:01.650 25746 INFO nova.osapi_compute.wsgi.server [req-c26a7d54-55ab-412e-947f-421a2cb934fc 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/3edec1e4-9678-4a3a-a21b-a145a4ee5e61 HTTP/1.1"" status: 200 len: 1708 time: 0.2011580"
2,nova-compute.log.2017-05-14_21:27:09 2017-05-14 19:39:02.007 2931 INFO nova.virt.libvirt.driver [req-e285b551-587f-4c1d-8eba-dceb2673637f 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: 3edec1e4-9678-4a3a-a21b-a145a4ee5e61] Creating image
3,"nova-api.log.2017-05-14_21:27:04 2017-05-14 19:39:02.924 25746 INFO nova.osapi_compute.wsgi.server [req-eb681812-78ae-4a9f-9e2a-96e505285512 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1759 time: 0.2698390"
4,nova-compute.log.2017-05-14_21:27:09 2017-05-14 19:39:03.166 2931 INFO nova.compute.manager [-] [instance: 2b590f10-49fd-4ec9-ae41-19596c2f4b25] VM Stopped (Lifecycle Event)


In [63]:
# Scan a copy of the log frame with the Presidio analyzer. The result mirrors
# `data`: each cell holds the findings reported for the matching input cell.
analyzed_data = analyze_and_store_results(df=data.copy(), analyzer=analyzer)
analyzed_data.head(5)

Unnamed: 0,A
0,"[type: URL, start: 79, end: 94, score: 0.5]"
1,"[type: URL, start: 79, end: 94, score: 0.5, type: PHONE_NUMBER, start: 350, end: 359, score: 0.4]"
2,"[type: URL, start: 71, end: 83, score: 0.5, type: PHONE_NUMBER, start: 54, end: 65, score: 0.4]"
3,"[type: URL, start: 79, end: 94, score: 0.5, type: PHONE_NUMBER, start: 50, end: 62, score: 0.4, type: PHONE_NUMBER, start: 320, end: 329, score: 0.4]"
4,"[type: URL, start: 71, end: 86, score: 0.5, type: PHONE_NUMBER, start: 54, end: 65, score: 0.4]"


In [64]:
# Anonymize every cell of `data` using the analyzer findings stored at the
# same (row, column) position in `analyzed_data`.
#
# The processed rows are collected in a plain list and the DataFrame is built
# once at the end; assigning cell-by-cell into an initially empty frame with
# `.loc` enlarges the frame on every new row label, which is slow.
processed_rows = []
for row_label, row_values in zip(data.index, data.itertuples(index=False, name=None)):
    processed_row = {}
    for col, text in zip(data.columns, row_values):
        # Analyzer results previously computed for this exact cell
        analyzer_results = analyzed_data.loc[row_label, col]

        # process_cell returns (processed_text, analyzer_output); only the text is kept
        processed_text, _analyzer_output = process_cell(text, analyzer_results)
        processed_row[col] = processed_text
    processed_rows.append(processed_row)

# Same index and column layout as the input frame
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A
0,"nova-api.log.2017-05-14_21:27:04 2017-05-14 19:39:01.445 25746 INFO nova.osapi_[31m<URL>[0mrver [req-5a2050e7-b381-4ae9-92d2-8b08e9f9f4c0 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1583 time: 0.1919448"
1,"nova-api.log.2017-05-14_21:27:04 2017-05-14 19:39:01.650 25746 INFO nova.osapi_[31m<URL>[0mrver [req-c26a7d54-55ab-412e-947f-421a2cb934fc 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/3edec1e4-9678-4a3a-a21b-a145a4ee5e61 HTTP/1.1"" status: 200 len: 1708 time: [31m<PHONE_NUMBER>[0m"
2,nova-compute.log.2017-05-14_21:27:09 2017-05-14 19:39:[31m<PHONE_NUMBER>[0m INFO [31m<URL>[0mbvirt.driver [req-e285b551-587f-4c1d-8eba-dceb2673637f 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] [instance: 3edec1e4-9678-4a3a-a21b-a145a4ee5e61] Creating image
3,"nova-api.log.2017-05-14_21:27:04 2017-05-14 19:39:[31m<PHONE_NUMBER>[0m INFO nova.osapi_[31m<URL>[0mrver [req-eb681812-78ae-4a9f-9e2a-96e505285512 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1759 time: [31m<PHONE_NUMBER>[0m"
4,nova-compute.log.2017-05-14_21:27:09 2017-05-14 19:39:[31m<PHONE_NUMBER>[0m INFO [31m<URL>[0mnager [-] [instance: 2b590f10-49fd-4ec9-ae41-19596c2f4b25] VM Stopped (Lifecycle Event)


## OpenStack Normal Logs

In [65]:
# Load the first 1000 non-empty log lines, one DataFrame row per line.
# The file is read as plain text rather than with
# `pd.read_csv(..., sep='\n')`: newer pandas releases reject a line terminator
# as separator (ValueError), and CSV parsing would also apply quote/NA
# handling that is not wanted for raw log text.
log_lines = []
with open('../input/loghub-openstack-log-data/openstack_normal1.log', encoding='utf-8') as log_file:
    for raw_line in log_file:
        line = raw_line.rstrip('\n')
        if not line.strip():
            continue  # skip blank lines, as read_csv does by default
        log_lines.append(line)
        if len(log_lines) == 1000:  # only the first 1000 entries are used
            break

data = pd.DataFrame({'Log_contents': log_lines})
# Short letter names (A, B, C, ...) are what the rest of the notebook expects
data.columns = [chr(65 + i) for i in range(len(data.columns))]
data = data.astype(str)
print(data.shape)
print(data.columns)
data.head()

(1000, 1)
Index(['A'], dtype='object')


Unnamed: 0,A
0,"nova-api.log.1.2017-05-16_13:53:08 2017-05-16 00:00:00.008 25746 INFO nova.osapi_compute.wsgi.server [req-38101a0b-2096-447d-96ea-a692162415ae 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2477829"
1,"nova-api.log.1.2017-05-16_13:53:08 2017-05-16 00:00:00.272 25746 INFO nova.osapi_compute.wsgi.server [req-9bc36dd9-91c5-4314-898a-47625eb93b09 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2577181"
2,"nova-api.log.1.2017-05-16_13:53:08 2017-05-16 00:00:01.551 25746 INFO nova.osapi_compute.wsgi.server [req-55db2d8d-cdb7-4b4b-993b-429be84c0c3e 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2731631"
3,"nova-api.log.1.2017-05-16_13:53:08 2017-05-16 00:00:01.813 25746 INFO nova.osapi_compute.wsgi.server [req-2a3dc421-6604-42a7-9390-a18dc824d5d6 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2580249"
4,"nova-api.log.1.2017-05-16_13:53:08 2017-05-16 00:00:03.091 25746 INFO nova.osapi_compute.wsgi.server [req-939eb332-c1c1-4e67-99b8-8695f8f1980a 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2727931"


In [66]:
# Scan a copy of the log frame with the Presidio analyzer. The result mirrors
# `data`: each cell holds the findings reported for the matching input cell.
analyzed_data = analyze_and_store_results(df=data.copy(), analyzer=analyzer)
analyzed_data.head(5)

Unnamed: 0,A
0,"[type: URL, start: 81, end: 96, score: 0.5, type: PHONE_NUMBER, start: 322, end: 331, score: 0.4]"
1,"[type: URL, start: 81, end: 96, score: 0.5, type: PHONE_NUMBER, start: 322, end: 331, score: 0.4]"
2,"[type: URL, start: 81, end: 96, score: 0.5, type: PHONE_NUMBER, start: 322, end: 331, score: 0.4]"
3,"[type: URL, start: 81, end: 96, score: 0.5, type: PHONE_NUMBER, start: 52, end: 64, score: 0.4, type: PHONE_NUMBER, start: 322, end: 331, score: 0.4]"
4,"[type: URL, start: 81, end: 96, score: 0.5, type: PHONE_NUMBER, start: 52, end: 64, score: 0.4, type: PHONE_NUMBER, start: 322, end: 331, score: 0.4]"


In [67]:
# Anonymize every cell of `data` using the analyzer findings stored at the
# same (row, column) position in `analyzed_data`.
#
# The processed rows are collected in a plain list and the DataFrame is built
# once at the end; assigning cell-by-cell into an initially empty frame with
# `.loc` enlarges the frame on every new row label, which is slow.
processed_rows = []
for row_label, row_values in zip(data.index, data.itertuples(index=False, name=None)):
    processed_row = {}
    for col, text in zip(data.columns, row_values):
        # Analyzer results previously computed for this exact cell
        analyzer_results = analyzed_data.loc[row_label, col]

        # process_cell returns (processed_text, analyzer_output); only the text is kept
        processed_text, _analyzer_output = process_cell(text, analyzer_results)
        processed_row[col] = processed_text
    processed_rows.append(processed_row)

# Same index and column layout as the input frame
processed_data = pd.DataFrame(processed_rows, index=data.index, columns=data.columns)

# Display the first few rows of the processed DataFrame
processed_data.head()

Unnamed: 0,A
0,"nova-api.log.1.2017-05-16_13:53:08 2017-05-16 00:00:00.008 25746 INFO nova.osapi_[31m<URL>[0mrver [req-38101a0b-2096-447d-96ea-a692162415ae 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: [31m<PHONE_NUMBER>[0m"
1,"nova-api.log.1.2017-05-16_13:53:08 2017-05-16 00:00:00.272 25746 INFO nova.osapi_[31m<URL>[0mrver [req-9bc36dd9-91c5-4314-898a-47625eb93b09 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: [31m<PHONE_NUMBER>[0m"
2,"nova-api.log.1.2017-05-16_13:53:08 2017-05-16 00:00:01.551 25746 INFO nova.osapi_[31m<URL>[0mrver [req-55db2d8d-cdb7-4b4b-993b-429be84c0c3e 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: [31m<PHONE_NUMBER>[0m"
3,"nova-api.log.1.2017-05-16_13:53:08 2017-05-16 00:00:[31m<PHONE_NUMBER>[0m INFO nova.osapi_[31m<URL>[0mrver [req-2a3dc421-6604-42a7-9390-a18dc824d5d6 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: [31m<PHONE_NUMBER>[0m"
4,"nova-api.log.1.2017-05-16_13:53:08 2017-05-16 00:00:[31m<PHONE_NUMBER>[0m INFO nova.osapi_[31m<URL>[0mrver [req-939eb332-c1c1-4e67-99b8-8695f8f1980a 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: [31m<PHONE_NUMBER>[0m"
