In [None]:
import dask.dataframe as dd
import pandas as pd

# Read the InterProScan TSV lazily with Dask. header=None means the first
# line is data, so the 15 columns are labelled "0" through "14"; every value
# is kept as a string.
filename = '/data/dataprocessing/interproscan/all_bacilli.tsv'
column_names = [str(position) for position in range(15)]
df = dd.read_csv(
    filename,
    sep='\t',
    header=None,
    names=column_names,
    dtype=str,
)

In [None]:
# Preview the first five rows of the table.
df.head(n=5)

In [None]:
# Dask evaluates lazily: `df.shape`, `df.describe()` and
# `df.isnull().sum().sum()` are only task graphs until they are computed, so
# printing them directly shows placeholders instead of numbers. Each result
# is therefore computed explicitly before it is printed.
separator = 75 * "_"

# (rows, columns) -- len(df) triggers the row count over the whole file.
print((len(df), len(df.columns)))

# info() writes its report to stdout itself and returns None, so it is called
# directly rather than wrapped in print() (which would just print "None").
df.info()
print("\n", separator)

print(df.describe().compute(), "\n", separator)
print("nulls : ", df.isnull().sum().sum().compute())

In [None]:
# Count distinct protein annotations, i.e. distinct values of column '11'.
# InterProScan writes '-' when a match has no InterPro entry; that placeholder
# is not an annotation, so it is removed before counting.
# NOTE(review): assumes the standard InterProScan TSV layout -- confirm.
interpro_accessions = df['11']
unique_annotations = (
    interpro_accessions[interpro_accessions != '-']
    .nunique()
    .compute(num_workers=16)
)

print("Distinct protein annotations:", unique_annotations)


In [None]:
# Mean number of annotation rows per protein: group the rows by column '1'
# (the per-protein key), count the rows in each group, then average the counts.
rows_per_protein = df.groupby('1').size()
mean_annotations = rows_per_protein.mean().compute(num_workers=16)

print("Average annotations per protein:", mean_annotations)


In [None]:
# GO annotations are stored in the 14th InterProScan column (labelled '13'
# here) as a '|'-separated list. Column '0' is the protein accession and holds
# no GO terms, so splitting it would only rank protein identifiers.
# NOTE(review): assumes the standard InterProScan TSV layout -- confirm.
go_column = df['13']

# '-' marks rows without any GO annotation; drop those rows so the
# placeholder cannot win the count.
annotated = go_column[go_column != '-']

# Split on '|' and expand to one row per individual GO term.
go_terms = annotated.str.split('|').explode()

# Count most common GO term
most_common_go_term = go_terms.value_counts().nlargest(1).compute().index[0]
print("Most common GO Term:", most_common_go_term)


In [None]:
# Feature size = column '7' minus column '6' (presumably the match stop and
# start positions), stored on the frame as 'FeatureSize' for later cells.
feature_start = df['6'].astype(int)
feature_stop = df['7'].astype(int)
df['FeatureSize'] = feature_stop - feature_start

# Mean feature size over every row of the table.
average_size_feature = df['FeatureSize'].mean().compute(num_workers=16)

# Two-decimal rendering, kept in its own variable as well as printed.
average_size_feature_formatted = f"{average_size_feature:.2f}"
print(f"Average size of InterPRO feature: {average_size_feature_formatted}")


In [None]:
# Top 10 most common InterPRO features.
# Features are identified by the InterPro accession in column '11' (the same
# column used for the distinct-annotation count). Column '1' is the
# per-protein key used for grouping elsewhere in this notebook, so counting it
# would rank proteins rather than features.
# NOTE(review): column roles follow the standard InterProScan TSV layout -- confirm.
interpro_accessions = df['11']

# '-' is InterProScan's placeholder for "no InterPro entry", not a feature.
interpro_accessions = interpro_accessions[interpro_accessions != '-']
top_10_interpro_features = (
    interpro_accessions.value_counts().nlargest(10).compute(num_workers=16)
)

print("Top 10 most common InterPRO features:")

# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (label, value) pairs.
for index, feature_count in top_10_interpro_features.items():
    print(f"Feature: {index}, Count: {feature_count}")


In [None]:
# retrieve the protein size from column '2'
protein_size = df['2'].astype(int)

# specify the similarity threshold (90-100% similarity)
similar_size_threshold = 0.9

# Fraction of the protein that each feature spans.
# Relies on the 'FeatureSize' column added to df earlier in the notebook.
size_ratio = df['FeatureSize'] / protein_size

# Keep features spanning 90-100% of their protein. A relative-difference test
# of the form |FeatureSize - protein_size| / protein_size <= 0.9 would accept
# features as small as 10% of the protein, i.e. filter out almost nothing.
similar_size_features = df[
    (size_ratio >= similar_size_threshold) & (size_ratio <= 1.0)
]

# Count InterPro accessions (column '11') among the selected rows. Column '1'
# is the per-protein key, so counting it would rank proteins, not features.
# '-' is InterProScan's placeholder for "no InterPro entry" and is skipped.
# NOTE(review): column roles follow the standard InterProScan TSV layout -- confirm.
similar_accessions = similar_size_features['11']
similar_accessions = similar_accessions[similar_accessions != '-']
top_10_similar_size_features = (
    similar_accessions.value_counts().nlargest(10).compute(num_workers=16)
)

print("Top 10 most common InterPRO features with similar size:")

# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (label, value) pairs.
for index, feature_count in top_10_similar_size_features.items():
    print(f"Feature: {index}, Count: {feature_count}")


In [None]:
from collections import Counter

# Join the textual annotation columns '3', '4', '5', '11' and '12' into one
# string per row. Every column is separated by a space so the last word of one
# column cannot fuse with the first word of the next, and missing values become
# empty strings (a single NaN would otherwise turn the whole row into NaN).
annotation_columns = ['3', '4', '5', '11', '12']
text_annotations = df[annotation_columns[0]].fillna('')
for column in annotation_columns[1:]:
    text_annotations = text_annotations + ' ' + df[column].fillna('')

# Lowercase, remove special characters, and normalise whitespace.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the text untouched.
text_annotations = (
    text_annotations.str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)

# frequency of each word
# Counted inside Dask (split -> explode -> value_counts) so the work runs in
# parallel and only the per-word totals are brought back to the client,
# instead of streaming every row through a Python loop.
word_frequencies = (
    text_annotations.str.split()
    .explode()
    .value_counts()
    .compute(num_workers=16)
)

# Kept as a Counter so most_common() is available to later cells.
word_counts = Counter(word_frequencies.to_dict())

# top 10 most common words
top_10_words = word_counts.most_common(10)

for word, count in top_10_words:
    print(word, count)


In [None]:
# The ten rarest words: take the full frequency ranking (most common first),
# flip it so the rarest come first, and keep the leading ten entries.
ranked_words = word_counts.most_common()
top_10_least_common_words = ranked_words[::-1][:10]

for word, count in top_10_least_common_words:
    print(word, count)

In [None]:
from collections import Counter

# Select InterPRO features that are almost the same size as the protein itself
protein_size = df['2'].astype(int)
similar_size_threshold = 0.9  # 90-100% similarity

# Fraction of the protein that each feature spans.
# Relies on the 'FeatureSize' column added to df earlier in the notebook.
size_ratio = df['FeatureSize'] / protein_size

# Keep features spanning 90-100% of their protein. A relative-difference test
# of the form |FeatureSize - protein_size| / protein_size <= 0.9 would accept
# features as small as 10% of the protein, i.e. filter out almost nothing.
similar_size_features = df[
    (size_ratio >= similar_size_threshold) & (size_ratio <= 1.0)
]

# Get the textual annotation columns for the selected features.
# Every column is separated by a space so words from adjacent columns cannot
# fuse, and missing values become empty strings (a single NaN would otherwise
# turn the whole row into NaN).
annotation_columns = ['3', '4', '5', '11', '12']
text_annotations = similar_size_features[annotation_columns[0]].fillna('')
for column in annotation_columns[1:]:
    text_annotations = text_annotations + ' ' + similar_size_features[column].fillna('')

# Lowercase, remove special characters, and normalise whitespace.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the text untouched.
text_annotations = (
    text_annotations.str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)

# frequency of each word
# Counted inside Dask (split -> explode -> value_counts) so only the per-word
# totals are brought back to the client.
word_frequencies = (
    text_annotations.str.split()
    .explode()
    .value_counts()
    .compute(num_workers=16)
)
word_counts = Counter(word_frequencies.to_dict())

# top 10 most common words
top_10_words = word_counts.most_common(10)

for word, count in top_10_words:
    print(word, count)


In [None]:
# coefficient of correlation between protein size and number of features
#
# The table holds one row per feature, so both quantities are first reduced to
# one value per protein (column '1' is the per-protein key):
#   'first' -> the protein size from column '2' (still a string at this point)
#   'count' -> the number of rows, i.e. features, found for that protein
# Correlating column '2' with (column '7' - column '6') row by row would
# measure protein size against feature *size*, not feature count.
per_protein = df.groupby('1')['2'].agg(['first', 'count'])

protein_sizes = per_protein['first'].astype(int)
feature_counts = per_protein['count']

# Pearson correlation across proteins; still lazy until compute() is called.
coefficient_of_correlation = protein_sizes.corr(feature_counts)
coefficient_of_correlation_result = coefficient_of_correlation.compute(num_workers=16)
print("Coefficient of correlation:", coefficient_of_correlation_result)
