In [None]:
# Mount Google Drive at /content/drive; the CSV inputs read later in this notebook
# live under /content/drive/MyDrive. This import only exists on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## *Code for NLP files download*

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Download the NLTK resources used by the tokenizer, the lemmatizer and the stop-word list.
nltk.download('punkt')
# NLTK >= 3.9 makes word_tokenize load the 'punkt_tab' tables instead of the pickled
# 'punkt' models; without this download tokenization fails with a LookupError there.
# On older releases the extra download is harmless.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Get English stopwords
stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Code for generating interactions between Disease, Chemical & Gene

In [None]:
import pandas as pd

# Load data from CSV files.
# Three of the four interaction tables are down-sampled in this cell. The sample is
# seeded so that Restart & Run All draws the same rows (and so the same graph) each time.
SAMPLE_SEED = 42


def _sample_rows(frame, n):
    """Return up to ``n`` randomly chosen rows of ``frame``, reproducibly.

    ``DataFrame.sample`` raises ``ValueError`` when ``n`` exceeds the number of rows
    (sampling without replacement), so the request is capped at ``len(frame)``.
    """
    return frame.sample(n=min(n, len(frame)), random_state=SAMPLE_SEED)


ayurveda_df = _sample_rows(
    pd.read_csv("/content/drive/MyDrive/IP/AyurVeda_New/interaction_Gene_Disease.csv"), 30)
chinese_df = _sample_rows(
    pd.read_csv("/content/drive/MyDrive/IP/Traditional Chinese Medicine/interaction_Gene_Disease.csv"), 50)
# NOTE(review): this table is loaded in full while the other three are sampled - confirm that is intended.
ayu_ch_df = pd.read_csv("/content/drive/MyDrive/IP/AyurVeda_New/interaction_chemical_gene.csv")
chin_ch_df = _sample_rows(
    pd.read_csv("/content/drive/MyDrive/IP/Traditional Chinese Medicine/interaction_chemical_gene.csv"), 50)
dataset = pd.read_csv("/content/drive/MyDrive/IP/dataset.csv")



# Initialize the stemmer (PorterStemmer is imported in the NLTK setup cell, not in this one)
stemmer = PorterStemmer()

# Initialize the lemmatizer
# NOTE(review): duplicates the assignment already made in the NLTK setup cell.
lemmatizer = WordNetLemmatizer()

# Get English stopwords
# NOTE(review): stop_words is not referenced by any code visible in this notebook -
# confirm whether stop-word removal was meant to be part of the cleaning step.
stop_words = set(stopwords.words('english'))

def stem_text(text):
    """Stem every token of ``text`` and join the stems back with single spaces.

    Tokens come from ``word_tokenize`` and are reduced with the module-level
    ``stemmer``. Anything that is not a ``str`` is handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    stems = (stemmer.stem(token) for token in word_tokenize(text))
    return ' '.join(stems)

def lemmatize_text(text):
    """Lemmatize every token of ``text`` and join the lemmas back with single spaces.

    Tokens come from ``word_tokenize`` and are mapped through the module-level
    ``lemmatizer``. Anything that is not a ``str`` is handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(text))
    return ' '.join(lemmas)

def clean_df(df):
    """Normalise every text cell of ``df`` and drop duplicate rows.

    Each ``str`` cell is lower-cased, lemmatized with ``lemmatize_text`` and then
    stemmed with ``stem_text``, in that order. Non-string cells are left untouched.
    Rows that are identical after normalisation are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The normalised, de-duplicated frame (index labels of the kept rows are preserved).
    """
    def normalise_cell(value):
        # Non-strings (NaN, numbers) pass through unchanged, just as each of the
        # individual helpers would leave them.
        if not isinstance(value, str):
            return value
        return stem_text(lemmatize_text(value.lower()))

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
    # fall back to applymap on older pandas. A single element-wise pass replaces the
    # three separate full-frame passes.
    elementwise = df.map if hasattr(type(df), 'map') else df.applymap
    return elementwise(normalise_cell).drop_duplicates()

# Normalise the text of all four interaction tables and drop their duplicate rows
ayurveda_df, chinese_df, ayu_ch_df, chin_ch_df = [
    clean_df(frame) for frame in (ayurveda_df, chinese_df, ayu_ch_df, chin_ch_df)
]


def replace_disease_names(df, dataset, min_matches=5):
    """Re-assign the ``Diseases`` value of rows whose symptoms match ``dataset``.

    For each row of ``df`` whose ``Diseases`` value occurs in ``dataset['Disease']``,
    the row's ``Symptom*`` columns are compared with the first ``dataset`` row for
    that disease (null reference symptoms are skipped). Once ``min_matches``
    symptoms agree, ``df.at[index, 'Diseases']`` is assigned the disease name.

    NOTE(review): the value assigned is the one the row already holds, so the frame
    comes back with the same contents. The calling code describes this step as
    replacing disease names with symptoms - confirm what the replacement should be.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Diseases`` column and, optionally, ``Symptom*`` columns.
        It is updated in place.
    dataset : pandas.DataFrame
        Reference table with a ``Disease`` column and ``Symptom*`` columns.
    min_matches : int, default 5
        Number of agreeing symptoms required before the assignment happens
        (the previous hard-coded test was ``symptoms_matched > 4``).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself.
    """
    # Only symptom columns present in BOTH frames can be compared; looking up a
    # column that ``dataset`` lacks used to raise KeyError.
    shared_symptom_cols = [
        col for col in df.columns
        if col.startswith('Symptom') and col in dataset.columns
    ]
    if not shared_symptom_cols:
        # Nothing to compare, so no row can reach the threshold.
        return df

    # First reference row per disease, indexed by name. Built once instead of
    # re-filtering ``dataset`` for every row of ``df``.
    reference = dataset.drop_duplicates(subset='Disease').set_index('Disease')

    for index, row in df.iterrows():
        disease = row['Diseases']
        # A missing disease never matched the reference before; keep it that way.
        if pd.isnull(disease) or disease not in reference.index:
            continue
        symptom_row = reference.loc[disease]
        symptoms_matched = 0
        for col in shared_symptom_cols:
            symptom = symptom_row[col]
            if pd.notnull(symptom) and row[col] == symptom:
                symptoms_matched += 1
                if symptoms_matched >= min_matches:
                    df.at[index, 'Diseases'] = disease
                    break

    return df


# Run the symptom-matching pass over the Ayurvedic gene-disease table.
# NOTE(review): this was described as replacing disease names with symptoms, but
# replace_disease_names only writes back the disease name the row already has - confirm intent.
ayurveda_df = replace_disease_names(ayurveda_df, dataset)

# Run the same pass over the Chinese gene-disease table
chinese_df = replace_disease_names(chinese_df, dataset)


# Edge list for the graph: every entry is a (source, target, weight) tuple, weight fixed at 1
data = []

# Genes that occur in at least one gene-disease row
genes_with_diseases = set()

# Medicine system -> disease and disease -> gene edges, Ayurvedic rows first, Chinese rows second
for system_label, disease_frame in (('Ayurvedic', ayurveda_df), ('Chinese', chinese_df)):
    for _, row in disease_frame.iterrows():
        disease, gene = row['Diseases'], row['Genes']
        data.append((system_label, disease, 1))
        data.append((disease, gene, 1))
        genes_with_diseases.add(gene)

# Genes that occur in at least one gene-chemical row
genes_with_chemicals = set()

# Gene -> chemical edges, Chinese table first, Ayurvedic table second
for chemical_frame in (chin_ch_df, ayu_ch_df):
    for _, row in chemical_frame.iterrows():
        gene = row['Genes']
        data.append((gene, row['Chemicals'], 1))
        genes_with_chemicals.add(gene)

# Collect all unique genes from all data
all_genes = pd.concat([ayurveda_df['Genes'], chinese_df['Genes'], ayu_ch_df['Genes'], chin_ch_df['Genes']]).unique()

# Unique genes per medicine system (disease table + chemical table)
ayurvedic_genes = pd.concat([ayurveda_df['Genes'], ayu_ch_df['Genes']]).unique()
chinese_genes = pd.concat([chinese_df['Genes'], chin_ch_df['Genes']]).unique()

# Hash-based lookups for the loop below. `gene in <array>` scans the whole array on
# every test, which makes the loop quadratic on the full data. NaN is dropped so a
# missing gene value matches neither system, as with the element-wise array comparison.
ayurvedic_gene_lookup = set(pd.Series(ayurvedic_genes).dropna())
chinese_gene_lookup = set(pd.Series(chinese_genes).dropna())

# Give every gene without a disease (or without a chemical) a placeholder neighbour
# so it still appears in every layer of the graph.
for gene in all_genes:
    if gene not in genes_with_diseases:
        # NOTE(review): a gene present in both systems only gets the Ayurvedic
        # placeholder because of the if/elif ordering - confirm that is intended.
        if gene in ayurvedic_gene_lookup:
            data.append(("no ayurvedic disease specified", gene, 1))
        elif gene in chinese_gene_lookup:
            data.append(("no chinese disease specified", gene, 1))
        else:
            # all_genes is the union of both systems, so this branch is only
            # reached by a missing (NaN) gene value; it is printed for inspection.
            print(gene)
    if gene not in genes_with_chemicals:
        data.append((gene, "no chemical specified", 1))

# Connect each medicine system to its "no disease" placeholder node
data.append(("Ayurvedic", "no ayurvedic disease specified", 1))
data.append(("Chinese", "no chinese disease specified", 1))


# Tabulate the edge list so it can be written out as CSV
interactions_df = pd.DataFrame(data=data, columns=['Source', 'Target', 'Value'])

# Persist the edges, without the index column, as interactions.csv
interactions_df.to_csv(path_or_buf='interactions.csv', index=False)


## Code to generate Graph from interactions on subset

In [None]:
import pandas as pd
import plotly.graph_objects as go
# Read the edge list back and collapse repeated (Source, Target) pairs by summing their values
interactions_df = (
    pd.read_csv('interactions.csv')
    .groupby(['Source', 'Target'], as_index=False)['Value']
    .sum()
)

# Node labels in first-seen order, plus parallel lists describing each link
labels, sources, targets, values = [], [], [], []

# Maps a node label to its position in `labels`
label_lookup = {}

# Walk the aggregated edges, registering unseen endpoints (source before target)
for _, edge in interactions_df.iterrows():
    endpoints = (edge['Source'], edge['Target'])
    for name in endpoints:
        if name not in label_lookup:
            label_lookup[name] = len(labels)
            labels.append(name)
    sources.append(label_lookup[endpoints[0]])
    targets.append(label_lookup[endpoints[1]])
    values.append(edge['Value'])

# Eight fixed colours for the first nodes; every further node gets a colour derived from its position
node_colors = ['#F27420', '#4994CE', '#FABC13', '#7FC241', '#D3D3D3', '#8A5988', '#449E9E', '#D3D3D3']
extra_positions = range(len(labels) - len(node_colors))
node_colors.extend('#%06x' % ((k * 1234567) % 0xFFFFFF) for k in extra_positions)

# Build the Sankey trace: nodes are the collected labels, links are the index-encoded edges
sankey_trace = go.Sankey(
    orientation='h',
    valueformat=".0f",
    domain={'x': [0, 1], 'y': [0, 1]},
    node={
        'pad': 10,
        'thickness': 30,
        'line': {'color': "black", 'width': 0.5},
        'label': labels,
        'color': node_colors,
    },
    link={
        'source': sources,
        'target': targets,
        'value': values,
        'color': "rgba(211, 211, 211, 0.5)",
    },
)

# Wrap the single trace in a figure
fig = go.Figure(data=[sankey_trace])

# Title, fonts and canvas height of the chart
fig.update_layout(
    title={
        'text': "Interactions between Ayurvedic / Chinese -> Diseases -> Genes -> Chemicals",
        'font': {'size': 16},
    },
    font={'size': 12, 'color': 'darkblue'},
    height=800,  # tall enough for the extra layer of nodes
)

# Save a standalone HTML copy, then render the figure inline
fig.write_html('scatter_plot.html')
fig.show()


## Code for generating interactions between Disease, Gene and Chemical on the whole data

In [None]:
import pandas as pd

# Load data from CSV files on the mounted Drive.
# Unlike the subset cell, every table is read in full here (no .sample call).
# These assignments overwrite the frames created by the subset cell.
ayurveda_df = pd.read_csv("/content/drive/MyDrive/IP/AyurVeda_New/interaction_Gene_Disease.csv")
chinese_df = pd.read_csv("/content/drive/MyDrive/IP/Traditional Chinese Medicine/interaction_Gene_Disease.csv")
ayu_ch_df = pd.read_csv("/content/drive/MyDrive/IP/AyurVeda_New/interaction_chemical_gene.csv")
chin_ch_df = pd.read_csv("/content/drive/MyDrive/IP/Traditional Chinese Medicine/interaction_chemical_gene.csv")
dataset =pd.read_csv("/content/drive/MyDrive/IP/dataset.csv")



# Initialize the stemmer (PorterStemmer is imported in the NLTK setup cell, not in this one)
stemmer = PorterStemmer()

# Initialize the lemmatizer
# NOTE(review): duplicates the assignment already made in the NLTK setup cell.
lemmatizer = WordNetLemmatizer()

# Get English stopwords
# NOTE(review): stop_words is not referenced by any code visible in this notebook -
# confirm whether stop-word removal was meant to be part of the cleaning step.
stop_words = set(stopwords.words('english'))

def stem_text(text):
    """Stem every token of ``text`` and join the stems back with single spaces.

    Tokens come from ``word_tokenize`` and are reduced with the module-level
    ``stemmer``. Anything that is not a ``str`` is handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    stems = (stemmer.stem(token) for token in word_tokenize(text))
    return ' '.join(stems)

def lemmatize_text(text):
    """Lemmatize every token of ``text`` and join the lemmas back with single spaces.

    Tokens come from ``word_tokenize`` and are mapped through the module-level
    ``lemmatizer``. Anything that is not a ``str`` is handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(text))
    return ' '.join(lemmas)

def clean_df(df):
    """Normalise every text cell of ``df`` and drop duplicate rows.

    Each ``str`` cell is lower-cased, lemmatized with ``lemmatize_text`` and then
    stemmed with ``stem_text``, in that order. Non-string cells are left untouched.
    Rows that are identical after normalisation are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The normalised, de-duplicated frame (index labels of the kept rows are preserved).
    """
    def normalise_cell(value):
        # Non-strings (NaN, numbers) pass through unchanged, just as each of the
        # individual helpers would leave them.
        if not isinstance(value, str):
            return value
        return stem_text(lemmatize_text(value.lower()))

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
    # fall back to applymap on older pandas. A single element-wise pass replaces the
    # three separate full-frame passes.
    elementwise = df.map if hasattr(type(df), 'map') else df.applymap
    return elementwise(normalise_cell).drop_duplicates()

# Normalise the text of all four interaction tables and drop their duplicate rows
ayurveda_df, chinese_df, ayu_ch_df, chin_ch_df = [
    clean_df(frame) for frame in (ayurveda_df, chinese_df, ayu_ch_df, chin_ch_df)
]


def replace_disease_names(df, dataset, min_matches=5):
    """Re-assign the ``Diseases`` value of rows whose symptoms match ``dataset``.

    For each row of ``df`` whose ``Diseases`` value occurs in ``dataset['Disease']``,
    the row's ``Symptom*`` columns are compared with the first ``dataset`` row for
    that disease (null reference symptoms are skipped). Once ``min_matches``
    symptoms agree, ``df.at[index, 'Diseases']`` is assigned the disease name.

    NOTE(review): the value assigned is the one the row already holds, so the frame
    comes back with the same contents. The calling code describes this step as
    replacing disease names with symptoms - confirm what the replacement should be.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Diseases`` column and, optionally, ``Symptom*`` columns.
        It is updated in place.
    dataset : pandas.DataFrame
        Reference table with a ``Disease`` column and ``Symptom*`` columns.
    min_matches : int, default 5
        Number of agreeing symptoms required before the assignment happens
        (the previous hard-coded test was ``symptoms_matched > 4``).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself.
    """
    # Only symptom columns present in BOTH frames can be compared; looking up a
    # column that ``dataset`` lacks used to raise KeyError.
    shared_symptom_cols = [
        col for col in df.columns
        if col.startswith('Symptom') and col in dataset.columns
    ]
    if not shared_symptom_cols:
        # Nothing to compare, so no row can reach the threshold.
        return df

    # First reference row per disease, indexed by name. Built once instead of
    # re-filtering ``dataset`` for every row of ``df``.
    reference = dataset.drop_duplicates(subset='Disease').set_index('Disease')

    for index, row in df.iterrows():
        disease = row['Diseases']
        # A missing disease never matched the reference before; keep it that way.
        if pd.isnull(disease) or disease not in reference.index:
            continue
        symptom_row = reference.loc[disease]
        symptoms_matched = 0
        for col in shared_symptom_cols:
            symptom = symptom_row[col]
            if pd.notnull(symptom) and row[col] == symptom:
                symptoms_matched += 1
                if symptoms_matched >= min_matches:
                    df.at[index, 'Diseases'] = disease
                    break

    return df


# Run the symptom-matching pass over the Ayurvedic gene-disease table.
# NOTE(review): this was described as replacing disease names with symptoms, but
# replace_disease_names only writes back the disease name the row already has - confirm intent.
ayurveda_df = replace_disease_names(ayurveda_df, dataset)

# Run the same pass over the Chinese gene-disease table
chinese_df = replace_disease_names(chinese_df, dataset)


# Edge list for the graph: every entry is a (source, target, weight) tuple, weight fixed at 1
data = []

# Genes that occur in at least one gene-disease row
genes_with_diseases = set()

# Medicine system -> disease and disease -> gene edges, Ayurvedic rows first, Chinese rows second
for system_label, disease_frame in (('Ayurvedic', ayurveda_df), ('Chinese', chinese_df)):
    for _, row in disease_frame.iterrows():
        disease, gene = row['Diseases'], row['Genes']
        data.append((system_label, disease, 1))
        data.append((disease, gene, 1))
        genes_with_diseases.add(gene)

# Genes that occur in at least one gene-chemical row
genes_with_chemicals = set()

# Gene -> chemical edges, Chinese table first, Ayurvedic table second
for chemical_frame in (chin_ch_df, ayu_ch_df):
    for _, row in chemical_frame.iterrows():
        gene = row['Genes']
        data.append((gene, row['Chemicals'], 1))
        genes_with_chemicals.add(gene)

# Collect all unique genes from all data
all_genes = pd.concat([ayurveda_df['Genes'], chinese_df['Genes'], ayu_ch_df['Genes'], chin_ch_df['Genes']]).unique()

# Unique genes per medicine system (disease table + chemical table)
ayurvedic_genes = pd.concat([ayurveda_df['Genes'], ayu_ch_df['Genes']]).unique()
chinese_genes = pd.concat([chinese_df['Genes'], chin_ch_df['Genes']]).unique()

# Hash-based lookups for the loop below. `gene in <array>` scans the whole array on
# every test, which makes the loop quadratic on the full data. NaN is dropped so a
# missing gene value matches neither system, as with the element-wise array comparison.
ayurvedic_gene_lookup = set(pd.Series(ayurvedic_genes).dropna())
chinese_gene_lookup = set(pd.Series(chinese_genes).dropna())

# Give every gene without a disease (or without a chemical) a placeholder neighbour
# so it still appears in every layer of the graph.
for gene in all_genes:
    if gene not in genes_with_diseases:
        # NOTE(review): a gene present in both systems only gets the Ayurvedic
        # placeholder because of the if/elif ordering - confirm that is intended.
        if gene in ayurvedic_gene_lookup:
            data.append(("no ayurvedic disease specified", gene, 1))
        elif gene in chinese_gene_lookup:
            data.append(("no chinese disease specified", gene, 1))
        else:
            # all_genes is the union of both systems, so this branch is only
            # reached by a missing (NaN) gene value; it is printed for inspection.
            print(gene)
    if gene not in genes_with_chemicals:
        data.append((gene, "no chemical specified", 1))

# Connect each medicine system to its "no disease" placeholder node
data.append(("Ayurvedic", "no ayurvedic disease specified", 1))
data.append(("Chinese", "no chinese disease specified", 1))


# Tabulate the edge list so it can be written out as CSV
interactions_df = pd.DataFrame(data=data, columns=['Source', 'Target', 'Value'])

# Persist the edges, without the index column, as interactions.csv
interactions_df.to_csv(path_or_buf='interactions.csv', index=False)


# Code for generating the graph of interactions on the whole data

In [None]:
import pandas as pd
import plotly.graph_objects as go
# Read the edge list back and collapse repeated (Source, Target) pairs by summing their values
interactions_df = (
    pd.read_csv('interactions.csv')
    .groupby(['Source', 'Target'], as_index=False)['Value']
    .sum()
)

# Node labels in first-seen order, plus parallel lists describing each link
labels, sources, targets, values = [], [], [], []

# Maps a node label to its position in `labels`
label_lookup = {}

# Walk the aggregated edges, registering unseen endpoints (source before target)
for _, edge in interactions_df.iterrows():
    endpoints = (edge['Source'], edge['Target'])
    for name in endpoints:
        if name not in label_lookup:
            label_lookup[name] = len(labels)
            labels.append(name)
    sources.append(label_lookup[endpoints[0]])
    targets.append(label_lookup[endpoints[1]])
    values.append(edge['Value'])

# Eight fixed colours for the first nodes; every further node gets a colour derived from its position
node_colors = ['#F27420', '#4994CE', '#FABC13', '#7FC241', '#D3D3D3', '#8A5988', '#449E9E', '#D3D3D3']
extra_positions = range(len(labels) - len(node_colors))
node_colors.extend('#%06x' % ((k * 1234567) % 0xFFFFFF) for k in extra_positions)

# Build the Sankey trace: nodes are the collected labels, links are the index-encoded edges
sankey_trace = go.Sankey(
    orientation='h',
    valueformat=".0f",
    domain={'x': [0, 1], 'y': [0, 1]},
    node={
        'pad': 10,
        'thickness': 30,
        'line': {'color': "black", 'width': 0.5},
        'label': labels,
        'color': node_colors,
    },
    link={
        'source': sources,
        'target': targets,
        'value': values,
        'color': "rgba(211, 211, 211, 0.5)",
    },
)

# Wrap the single trace in a figure
fig = go.Figure(data=[sankey_trace])

# Title, fonts and canvas height of the chart
fig.update_layout(
    title={
        'text': "Interactions between Ayurvedic / Chinese -> Diseases -> Genes -> Chemicals",
        'font': {'size': 16},
    },
    font={'size': 12, 'color': 'darkblue'},
    height=800,  # tall enough for the extra layer of nodes
)

# Save a standalone HTML copy, then render the figure inline
fig.write_html('scatter_plot_lg.html')
fig.show()
