In [1]:
import pandas as pd

# Path to the extracted file
file_path = '/home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/testing1st/leaderboard.test.data.20130318_20.txt'  # Replace with actual path

# Load the dataset (assuming tab-separated values)
data = pd.read_csv(file_path, sep='\t', header=None)  # Adjust 'header' if there are headers in the file

# Explore the first few rows
print("First few rows of the dataset:")
print(data.head())

# Get summary information of the dataset
print("\nDataset Info:")
data.info()

# If there are categorical columns, you can preprocess them like so:
from sklearn.preprocessing import LabelEncoder

# Example preprocessing (adjust column indices based on data)
categorical_columns = [1, 2, 3]  # Replace with actual indices of categorical columns
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))  # Convert categorical to numerical
    label_encoders[col] = le

# The data is now ready for model training!

First few rows of the dataset:
                                 0                  1   2   \
0  b006d2236cd8786f8269e3bd60dd795f  20130318000100765   1   
1  72f80b647a1ee74dd131c00f5dbb0174  20130318000100802   1   
2  6f8cee7aecd741b28ad303e9bb9cf1eb  20130318000100827   1   
3  58fcbfa5f1c6381a207456157301d9d3  20130318000100837   1   
4  f6c97f428a534bc515a52e1eab9b53cf  20130318000100840   1   

                                 3   \
0  c43a65560b3c8adc7cfe24ef38e82398   
1  d09e956039c402a4c36513920c06bc84   
2  519a2259e67cc42c5d422a16a28b5b0a   
3  46f2497b959b6e5f331032c5a8985281   
4  6fa09a1916b9eb75010f756e4fb276fb   

                                                  4              5   6    7   \
0  Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...   60.190.192.*  94  100   
1  Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1...    175.170.0.*  40   42   
2  Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1...  123.186.178.*  40   45   
3  Mozilla/4.0 (compatible; MSIE 6.0; Win

In [5]:
import os
import csv

def process_bid_files(datapath):
    # List all decompressed bid files (assumes the files have been decompressed)
    bid_files = [f for f in os.listdir(datapath) if f.startswith('bid') and f.endswith('.txt')]

    for bid_file in bid_files:
        # Convert each bid file to CSV
        bid_file_path = os.path.join(datapath, bid_file)
        csv_file_path = bid_file_path.replace('.txt', '.csv')
        
        print(f"Processing {bid_file_path} -> {csv_file_path}")
        
        with open(csv_file_path, 'w', newline='') as csv_out:
            spam_writer = csv.writer(csv_out, dialect='excel')
            
            with open(bid_file_path, 'r') as log_in:
                for line in log_in:
                    line_list = line.strip('\n').split('\t')
                    spam_writer.writerow(line_list)
        
        print(f"Completed processing {bid_file_path}")

if __name__ == '__main__':
    # Data path setup (adjust as necessary)
    data_path = '/home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st'
    
    # Process the bid files after decompression
    process_bid_files(data_path)
    
    # Now you can proceed with further data preparation using the CSV files

In [10]:
import os
import csv
import operator
import random
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Function to set a random seed for reproducibility
def setup_seed(seed):
    np.random.seed(seed)
    random.seed(seed)

# Function to convert the time into fractional intervals
def to_time_frac(hour, minute, time_frac_dict):
    for key in time_frac_dict[hour].keys():
        if key[0] <= minute <= key[1]:
            return str(time_frac_dict[hour][key])

# Function to transform and encode features based on specific rules
def feat_trans(name, content, oses, browsers):
    content = content.lower()
    if name == "useragent":
        # Handle OS and browser detection
        operation = next((o for o in oses if o in content), "other")
        browser = next((b for b in browsers if b in content), "other")
        return operation + "_" + browser
    if name == "slotprice":
        price = int(content)
        if price > 100:
            return "101+"
        elif price > 50:
            return "51-100"
        elif price > 10:
            return "11-50"
        elif price > 0:
            return "1-10"
        else:
            return "0"

# Main function to prepare data (can be extended for different formats)
def to_libsvm_encode(datapath, time_frac_dict):
    print('Converting to LIBSVM encoding...')
    
    # Known OSes and Browsers
    oses = ["windows", "ios", "mac", "android", "linux"]
    browsers = ["chrome", "sogou", "maxthon", "safari", "firefox", "theworld", "opera", "ie"]

    # Feature columns that require encoding
    f1s = ["weekday", "hour", "IP", "region", "city", "adexchange", "domain", "slotid", "slotwidth", "slotheight",
           "slotvisibility", "slotformat", "creative", "advertiser"]
    f1sp = ["useragent", "slotprice"]

    # Load the CSV (converted from the log file earlier)
    fi = open(os.path.join(datapath, 'train.bid.csv'), 'r')

    # Initialize feature indices
    featindex = {}
    maxindex = 0  # Keep track of the max index

    first = True
    namecol = {}  # Column name index mapping

    # Processing training data and generating feature indices
    for line in fi:
        s = line.split(',')
        if first:
            first = False
            for i in range(len(s)):
                namecol[s[i].strip()] = i
                if i > 0:
                    featindex[str(i) + ':other'] = maxindex
                    maxindex += 1
            continue

        # Process each feature in `f1s`
        for f in f1s:
            col = namecol[f]
            content = s[col]
            feat = str(col) + ':' + content
            if feat not in featindex:
                featindex[feat] = maxindex
                maxindex += 1

        # Process transformed features in `f1sp`
        for f in f1sp:
            col = namecol[f]
            content = feat_trans(f, s[col], oses, browsers)
            feat = str(col) + ':' + content
            if feat not in featindex:
                featindex[feat] = maxindex
                maxindex += 1

    print('Feature size:', maxindex)
    return featindex  # Return the feature index dictionary for further processing

# Time Fraction Dictionary (24 hours, each hour split into 15-minute intervals)
def create_time_frac_dict():
    time_frac_dict = {}
    count = 0
    for i in range(24):
        hour_frac_dict = {}
        for item in [(0, 15), (15, 30), (30, 45), (45, 60)]:
            hour_frac_dict[item] = count
            count += 1
        time_frac_dict[i] = hour_frac_dict
    return time_frac_dict

# Function to convert bid log files into CSV format
def process_bid_files(datapath):
    # List all decompressed bid files
    bid_files = [f for f in os.listdir(datapath) if f.startswith('bid') and f.endswith('.txt')]

    for bid_file in bid_files:
        # Convert each bid file to CSV
        bid_file_path = os.path.join(datapath, bid_file)
        csv_file_path = bid_file_path.replace('.txt', '.csv')
        
        print(f"Processing {bid_file_path} -> {csv_file_path}")
        
        with open(csv_file_path, 'w', newline='') as csv_out:
            spam_writer = csv.writer(csv_out, dialect='excel')
            
            with open(bid_file_path, 'r') as log_in:
                for line in log_in:
                    line_list = line.strip('\n').split('\t')
                    spam_writer.writerow(line_list)
        
        print(f"Completed processing {bid_file_path}")

if __name__ == '__main__':
    # Data path setup (example path, adjust as necessary)
    data_path = '/home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st'
    
    # Process the bid files after decompression
    process_bid_files(data_path)
    
    # Time Frac Dictionary for 15-minute intervals
    time_frac_dict = create_time_frac_dict()

    # Convert the data to LIBSVM format and extract feature indices
    featindex = to_libsvm_encode(data_path, time_frac_dict)
    
    # Further processing can happen after this to prepare the data for model training

Processing /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/bid.20130313.txt -> /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/bid.20130313.csv
Completed processing /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/bid.20130313.txt
Processing /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/bid.20130311.txt -> /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/bid.20130311.csv
Completed processing /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/bid.20130311.txt
Processing /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/bid.20130316.txt -> /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/i

In [11]:
import os
import csv

# Function to combine all CSV files into one
def combine_csv_files(datapath, output_file='train.bid.csv'):
    # List all CSV files generated from bid logs
    csv_files = [f for f in os.listdir(datapath) if f.startswith('bid') and f.endswith('.csv')]
    
    combined_csv_path = os.path.join(datapath, output_file)
    
    with open(combined_csv_path, 'w', newline='') as out_file:
        csv_writer = csv.writer(out_file)
        header_written = False
        
        for csv_file in csv_files:
            csv_file_path = os.path.join(datapath, csv_file)
            
            with open(csv_file_path, 'r') as in_file:
                csv_reader = csv.reader(in_file)
                
                for i, row in enumerate(csv_reader):
                    # Write header only once
                    if i == 0 and header_written:
                        continue
                    if i == 0:
                        header_written = True
                    csv_writer.writerow(row)
                    
            print(f"Combined {csv_file_path} into {combined_csv_path}")
    
    print(f"All CSV files combined into {combined_csv_path}")

if __name__ == '__main__':
    # Data path setup
    data_path = '/home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st'
    
    # Step 1: Combine all the bid CSV files into a single train.bid.csv
    combine_csv_files(data_path)

    # Step 2: Time Frac Dictionary for 15-minute intervals
    time_frac_dict = create_time_frac_dict()

    # Step 3: Convert the data to LIBSVM format and extract feature indices
    featindex = to_libsvm_encode(data_path, time_frac_dict)
    
    # Further processing can happen after this to prepare the data for model training

Combined /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/bid.20130316.csv into /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/train.bid.csv
Combined /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/bid.20130317.csv into /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/train.bid.csv
Combined /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/bid.20130312.csv into /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/train.bid.csv
Combined /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/bid.20130314.csv into /home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/tr

KeyError: 'weekday'

In [12]:
csv_file_path = '/home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training1st/train.bid.csv'

# Load and print the first few rows of the data along with column names
data = pd.read_csv(csv_file_path, nrows=5)
print("Column names in the CSV file:", data.columns)
print(data.head())

Column names in the CSV file: Index(['7ccd6d80ef7b50127eaa453b8f06075a', '20130316003300692',
       '37a6259cc0c1dae299a7866489dff0bd',
       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727),gzip(gfe),gzip(gfe)',
       '124.193.57.*', '1', '1.1', '2', 'trqRTvdbjq17DqKbuKz',
       '240898007d3953fbdcb5c8b934942b6f', 'Unnamed: 10', '1938265360', '336',
       '280', '1.2', '0', '5', '02adb1d6bc7233c0735dbefe9bb85ecd', '300'],
      dtype='object')
   7ccd6d80ef7b50127eaa453b8f06075a  20130316003300692  \
0   4e44d8345b95902e92cdeefa40f5c81  20130316003300696   
1  2ed51eeeef2d6fbdda9f856743a0d801  20130316003300701   
2  def432610104f0492ce889114cb19761  20130316003300725   
3  848d5122077d69ab89fe9c5cdd8687f3  20130316003300731   
4  ddd326e4d0107aae9a5b0706e67f1116  20130316003300734   

   37a6259cc0c1dae299a7866489dff0bd  \
0  611a964c794e51980cc1ea370f201e05   
1  6462f28bf9ce7800d14a40815e446275   
2  bc2b62a048c37437cb45db5b211f4daa   
3  e7

In [14]:
import pandas as pd

# Load data
bidding_log = pd.read_csv('/home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training2nd/bid.20130606.txt', sep='\t')
impression_log = pd.read_csv('/home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training2nd/imp.20130606.txt', sep='\t')
click_log = pd.read_csv('/home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training2nd/clk.20130606.txt', sep='\t')

# Display the first few rows of bidding data
print(bidding_log.head())

  bidding_log = pd.read_csv('/home/vladplyusnin/tftest/Deep-Learning-COPSCI764/Project/Data sets/ipinyou.contest.dataset/training2nd/bid.20130606.txt', sep='\t')


   b382c1c156dcbbd5b9317cb50f6a747b  20130606000104008  Vh16OwT6OQNUXbj  \
0   7b6195de0d14203f92001da653bf1de  20130606000104009  Vhkr1vpROHuhQWB   
1  2ea9fe21cf7350fcb5696d8cff0bbeaa  20130606000104012  VhKdLnuY3tlhXMa   
2  8a15b98c8f9e60d4f92aaab01acf52a4  20130606000104014  VhTVORqG36N6qMj   
3  faf17eac9cabf1be598f4e75f40d501d  20130606000104016  VhL01pk8OTkW3Mc   
4  c60989edb8618fb1ab70ae56824af7ee  20130606000104017              NaN   

  mozilla/4.0 (compatible; msie 6.0; windows nt 5.1; sv1; qqdownload 718)  \
0  Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...                        
1  mozilla/4.0 (compatible; msie 8.0; windows nt ...                        
2  mozilla/5.0 (windows nt 5.1) applewebkit/537.1...                        
3  mozilla/5.0 (windows nt 5.1) applewebkit/537.1...                        
4  mozilla/4.0 (compatible; msie 8.0; windows nt ...                        

   180.127.189.*   80   87  1 tFKETuqyMo1mjMp45SqfNX  \
0  113.119.105.*  216  217  2 