In [None]:
import re
from collections import Counter

#Function to parse log entries
def parse_log_entry(log_entry):
    """Parse one access-log line in Common/Combined Log Format.

    Matching is anchored at the start of the line and stops after the
    response-size field, so trailing fields (referrer, user agent, ...)
    are ignored.

    Args:
        log_entry: A single raw log line.

    Returns:
        A dict of strings with the keys 'ip', 'timestamp', 'method', 'path',
        'protocol' and 'status_code', or None when the line does not match
        the expected layout.
    """
    # The last group is the response size. It is accepted as either digits or
    # "-": CLF writes "-" when no body bytes were sent (e.g. 304 responses),
    # and such lines must not be discarded as unparseable.
    pattern = r'(\S+) (\S+) (\S+) \[([^\]]+)\] "(\S+) (\S+) (\S+)" (\d+) (\d+|-)'
    match = re.match(pattern, log_entry)
    if match is None:
        return None

    # Groups 2, 3 (identity / user) and 9 (size) are matched but not reported.
    ip, _, _, timestamp, method, path, protocol, status_code, _ = match.groups()
    return {
        'ip': ip,
        'timestamp': timestamp,
        'method': method,
        'path': path,
        'protocol': protocol,
        'status_code': status_code,
    }

#Function to analyze log entries
def analyze_log(log_entries):
    """Summarize a collection of raw access-log lines.

    Args:
        log_entries: Any iterable of raw log lines (list, generator, open
            file handle, ...). It is consumed in a single pass and does not
            need to support len().

    Returns:
        A dict with the keys:
            'total_requests': number of lines seen, including lines that
                parse_log_entry could not parse.
            'methods': mapping of HTTP method -> count (parsed lines only).
            'status_codes': mapping of status code -> count (parsed lines only).
            'unique_ips': number of distinct 'ip' values among parsed lines.
            'popular_urls': the five most frequent paths mapped to their counts.
    """
    # Counted during iteration rather than with len() so that generators and
    # file objects work as well as lists.
    total_requests = 0

    #Extracted information counters
    methods = Counter()
    status_codes = Counter()
    unique_ips = set()
    popular_urls = Counter()

    for entry in log_entries:
        total_requests += 1
        parsed_entry = parse_log_entry(entry)
        if not parsed_entry:
            # Unparseable lines still count towards total_requests only.
            continue
        methods[parsed_entry['method']] += 1
        status_codes[parsed_entry['status_code']] += 1
        unique_ips.add(parsed_entry['ip'])
        popular_urls[parsed_entry['path']] += 1

    return {
        'total_requests': total_requests,
        'methods': dict(methods),
        'status_codes': dict(status_codes),
        'unique_ips': len(unique_ips),
        'popular_urls': dict(popular_urls.most_common(5))
    }

#Example usage
log_file_path = r"/content/access1.log"
# Decode explicitly as UTF-8 and substitute undecodable bytes: access logs can
# contain arbitrary client-supplied bytes, and with the platform-default
# encoding in strict mode a single bad byte would abort the whole read with
# UnicodeDecodeError.
with open(log_file_path, 'r', encoding='utf-8', errors='replace') as file:
    log_entries = file.readlines()
# Peek at the first few raw lines as a sanity check of the input.
print(log_entries[0:5])

analysis_results = analyze_log(log_entries)

#Display results
print("Total Requests:", analysis_results['total_requests'])
print("Unique IPs:", analysis_results['unique_ips'])
print("\nHTTP Methods:")
for method, count in analysis_results['methods'].items():
    print(f"{method}: {count}")
print("\nStatus Codes:")
for status_code, count in analysis_results['status_codes'].items():
    print(f"{status_code}: {count}")
print("\nTop 5 Popular URLs:")
for url, count in analysis_results['popular_urls'].items():
    print(f"{url}: {count}")

['%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,p53 HTTP/1.1" 200 30577 "-" "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)" "-"\n', '31.56.96.51 - - [22/Jan/2019:03:56:16 +0330] "GET /image/60844/productModel/200x200 HTTP/1.1" 200 5667 "https://www.zanbil.ir/m/filter/b113" "Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build/HuaweiALE-L21) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.158 Mobile Safari/537.36" "-"\n', '31.56.96.51 - - [22/Jan/2019:03:56:16 +0330] "GET /image/61474/productModel/200x200 HTTP/1.1" 200 5379 "https://www.zanbil.ir/m/filter/b113" "Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build/HuaweiALE-L21) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.158 Mobile Safari/537.36" "-"\n', '40.77.167.129 - - [22/Jan/2019:03:56:17 +0330] "GET /image/14925/productModel/100x100 HTTP/1.1" 200 1696 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" "-"\n', '91.99.72.15 - - [22/Jan/2019:03:56:17 +0330] "GET /product/31893/62100/%D8%B3%D

In [None]:
import pandas as pd
from collections import Counter

#Read CSV file
# Load the clickstream export into a DataFrame
csv_file_path = r"/content/vodclickstream_uk_movies_03.csv"
df = pd.read_csv(csv_file_path)

# Quick look at the data: first rows, column labels and per-column dtypes
print("Dataset Overview:", df.head(), sep="\n")
print("\nColumns:", df.columns)
print("\nData Types:", df.dtypes)

# Keep only the columns used for the clickstream analysis
user_clicks = df.loc[:, ['title', 'user_id', 'movie_id', 'datetime']]

# Number of rows per user_id (each row is treated as one click)
clicks_per_user = user_clicks.groupby('user_id').size()

# Number of rows per title, most frequent first
popular_movies = (
    user_clicks
    .groupby('title')
    .size()
    .sort_values(ascending=False)
)

# Report both tallies; only the ten most frequent titles are shown
print("\nClicks Per User:", clicks_per_user, sep="\n")
print("\nMost Popular Movies:", popular_movies.head(10), sep="\n")

Dataset Overview:
   Unnamed: 0          datetime  duration                               title  \
0     58773.0  01-01-2017 01:15       0.0  Angus, Thongs and Perfect Snogging   
1     58774.0  01-01-2017 13:56       0.0        The Curse of Sleeping Beauty   
2     58775.0  01-01-2017 15:17   10530.0                   London Has Fallen   
3     58776.0  01-01-2017 16:04      49.0                            Vendetta   
4     58777.0  01-01-2017 19:16       0.0     The SpongeBob SquarePants Movie   

                                              genres release_date    movie_id  \
0                             Comedy, Drama, Romance   25-07-2008  26bd5987e8   
1                 Fantasy, Horror, Mystery, Thriller   02-06-2016  f26ed2675e   
2                                   Action, Thriller   04-03-2016  f77e500e7a   
3                                      Action, Drama   12-06-2015  c74aec7673   
4  Animation, Action, Adventure, Comedy, Family, ...   19-11-2004  a80d6fc2aa   

      us

In [None]:
from urllib import parse
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from collections import Counter

def get_links_from_url(url, timeout=10):
    """Download a page and return the href of every anchor tag that has one.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server, passed to requests.get.
            requests waits indefinitely unless a timeout is given, so a
            stalled server would otherwise hang this call.

    Returns:
        A list of raw href strings (they may be relative). An empty list is
        returned, and the error printed, if the request or the parsing fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures rather than harvesting links
        # from an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return [a['href'] for a in soup.find_all('a', href=True)]
    except Exception as e:
        # Deliberately best-effort: any failure is reported and yields no links.
        print(f"Error retreiving links from {url}: {e}")
        return []

def analyze_links(links, base_url=None):
    """Count how often each host and each path occurs among a set of links.

    Args:
        links: Iterable of URL strings, absolute or relative.
        base_url: Optional URL of the page the links came from. When given,
            every link is resolved against it with urljoin before parsing, so
            relative links are attributed to that page's host instead of the
            empty host ''. The default (None) analyzes the links as given.

    Returns:
        A (host_counts, path_counts) tuple of Counters keyed by the parsed
        netloc and path respectively. Links rejected with ValueError by
        urljoin/urlparse are skipped.
    """
    host_counts = Counter()
    path_counts = Counter()

    for link in links:
        try:
            target = urljoin(base_url, link) if base_url else link
            parsed_link = urlparse(target)
        except ValueError:
            # Malformed href (e.g. an unbalanced '[' in the host part); one
            # bad link must not abort the whole tally.
            continue
        host_counts[parsed_link.netloc] += 1
        path_counts[parsed_link.path] += 1

    return host_counts, path_counts

def main(target_url='https://youtube.com'):
    """Fetch a page, tally the hosts and paths of its links, and print both tallies.

    Args:
        target_url: Page to analyze. The default is the address that used to
            be hard-coded, so calling main() without arguments behaves as before.
    """
    links = get_links_from_url(target_url)
    host_counts, path_counts = analyze_links(links)

    print("Host Counts:")
    print(host_counts)

    print("\nPath Counts:")
    print(path_counts)

if __name__ == "__main__":
    main()

Host Counts:
Counter({'www.youtube.com': 7, '': 6, 'developers.google.com': 1, 'tv.youtube.com': 1})

Path Counts:
Counter({'/': 2, '/about/': 1, '/about/press/': 1, '/about/copyright/': 1, '/t/contact_us/': 1, '/creators/': 1, '/ads/': 1, '/youtube': 1, '/t/terms': 1, '/t/privacy': 1, '/about/policies/': 1, '/howyoutubeworks': 1, '/new': 1, '/learn/nflsundayticket': 1})
