Skip to content

Commit

Permalink
feat(ai): ai 增加vnc, shell 到容器, ai提交作业优化 (#1202)
Browse files Browse the repository at this point in the history
# ai 提交作业优化
1. 支持多个挂载点
2. 镜像支持手动输入
3. 已分享的镜像,数据集,以及模型,选取私有时,取privatePath而不是分享后的路径
4. 将提交作业时ai相关的参数都放入extra_options里

# ai 新增以shell 的方式进入容器的功能
ai 新增进入训练中的作业的容器并执行 shell 操作的功能。该功能依赖于 k8s 的 api server,所以需要一份 kubectl
config 配置文件。

![image](https://github.com/PKUHPC/SCOW/assets/140392039/8c01af88-ffac-41fd-9600-1658a7b6c24d)


![image](https://github.com/PKUHPC/SCOW/assets/140392039/1e9b433d-b18c-4cc1-88c1-b667f70c2070)

# ai 增加 vnc
AI 模块新增 VNC 应用

---------

Co-authored-by: Miracle575 <longsijie@icode.pku.edu.cn>
  • Loading branch information
ZihanChen821 and Miracle575 committed Apr 23, 2024
1 parent a4d36e2 commit e312efb
Show file tree
Hide file tree
Showing 31 changed files with 1,326 additions and 163 deletions.
7 changes: 7 additions & 0 deletions .changeset/lemon-fans-peel.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"@scow/config": patch
"@scow/cli": patch
"@scow/ai": patch
---

AI 模块支持创建 vnc 类型应用
10 changes: 10 additions & 0 deletions .changeset/pretty-toes-greet.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
"@scow/scheduler-adapter-protos": patch
"@scow/test-adapter": patch
"@scow/config": patch
"@scow/cli": patch
"@scow/ai": patch
"@scow/docs": patch
---

ai 增加 vnc 功能,以 shell 方式进入容器功能和提交作业的优化
5 changes: 5 additions & 0 deletions .changeset/witty-camels-share.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@scow/ai": patch
---

ai 新增以 shell 的方式进入容器的功能
6 changes: 5 additions & 1 deletion apps/ai/assets/app/vnc_entry.sh
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
//TODO
#!/bin/bash

# Expose the script's first three positional arguments as environment variables
# so that processes started from this shell can read them.
# NOTE(review): presumably consumed by the VNC app's startup logic — confirm against the caller.

# $1 -> PORT
export PORT=$1
# $2 -> HOST
export HOST=$2
# $3 -> SVCPORT
export SVCPORT=$3
2 changes: 1 addition & 1 deletion apps/ai/next.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ export default async () => {
// HACK setup ws proxy
setTimeout(() => {
const url = `http://localhost:${process.env.PORT || 3000}${join(BASE_PATH, "/api/setup")}`;
console.log("Calling setup url to initialize proxy and shell server", url);
console.log("Calling setup url to initialize proxy and job shell server", url);

fetch(url).then(async (res) => {
console.log("Call completed. Response: ", await res.text());
Expand Down
7 changes: 4 additions & 3 deletions apps/ai/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
"@ddadaal/tsgrpc-client": "0.17.7",
"@ddadaal/tsgrpc-common": "0.2.5",
"@grpc/grpc-js": "1.10.6",
"@kubernetes/client-node": "^0.20.0",
"@mikro-orm/cli": "6.1.12",
"@mikro-orm/core": "6.1.12",
"@mikro-orm/migrations": "6.1.12",
Expand All @@ -48,9 +49,9 @@
"@scow/lib-decimal": "workspace:*",
"@scow/lib-operation-log": "workspace:*",
"@scow/lib-scheduler-adapter": "workspace:*",
"@scow/lib-server": "workspace:*",
"@scow/lib-ssh": "workspace:*",
"@scow/lib-web": "workspace:*",
"@scow/lib-server": "workspace:*",
"@scow/scheduler-adapter-protos": "workspace:*",
"@scow/utils": "workspace:*",
"@scow/rich-error-model": "workspace:*",
Expand Down Expand Up @@ -87,8 +88,8 @@
"swagger-ui-react": "5.13.0",
"trpc-openapi": "1.2.0",
"ws": "8.16.0",
"xterm": "5.3.0",
"xterm-addon-fit": "0.8.0",
"@xterm/xterm": "5.5.0",
"@xterm/addon-fit": "0.10.0",
"zod": "3.22.4",
"shell-quote": "1.8.1",
"replace-in-file": "7.1.0"
Expand Down
26 changes: 12 additions & 14 deletions apps/ai/src/app/(auth)/dashboard/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@

"use client";
import { join } from "path";
import { usePublicConfig } from "src/app/(auth)/context";
import { Head } from "src/utils/head";
import { trpc } from "src/utils/trpc";
import { styled } from "styled-components";

const Logo = styled.div`
Expand All @@ -25,24 +25,22 @@ const Logo = styled.div`

export default function Page() {

const { data } = trpc.config.publicConfig.useQuery();
const { publicConfig: { BASE_PATH } } = usePublicConfig();

return (
<div>
<Head title={"dashboard"} />
{
data ? (
<Logo>
<img
alt="logo"
src={join(data.BASE_PATH, "/api/logo?type=logo")}
style={{
objectFit: "contain",
maxWidth: "50%",
}}
/>
</Logo>
) : undefined
<Logo>
<img
alt="logo"
src={join(BASE_PATH, "/api/logo?type=logo")}
style={{
objectFit: "contain",
maxWidth: "50%",
}}
/>
</Logo>
}
</div>
);
Expand Down
94 changes: 94 additions & 0 deletions apps/ai/src/app/(auth)/jobShell/[clusterId]/[jobId]/page.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/**
* Copyright (c) 2022 Peking University and Peking University Institute for Computing and Digital Economy
* SCOW is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
*/

"use client";

import "xterm/css/xterm.css";

import { Button, Space } from "antd";
import dynamic from "next/dynamic";
import { usePublicConfig } from "src/app/(auth)/context";
import { Head } from "src/utils/head";
import { styled } from "styled-components";

/**
 * Root wrapper of the job shell page: a fixed-position element covering the
 * whole viewport (z-index 2000) that lays out its children in a column.
 */
const Container = styled.div`
position: fixed;
left: 0;
top: 0;
height: 100%;
width: 100%;
z-index: 2000;
display: flex;
flex-direction: column;
`;

/**
 * Dark top bar whose children are pushed to opposite ends of the row.
 * Nested `h2` elements are rendered white without margin, and paragraphs
 * matching `.ant-popover-content p` have their margin removed.
 */
const Header = styled.div`
padding: 8px 16px;
display: flex;
justify-content: space-between;
background-color: #333;
h2 { color: white; margin: 0px; }
.ant-popover-content p {
margin: 0;
}
`;


/**
 * Flex wrapper around the shell component; `flex: 1` lets it take the space
 * left over by its siblings inside the column-direction `Container`.
 */
const TerminalContainer = styled.div`
display: flex;
flex: 1;
height: 100%;
`;

/**
 * Full-height black block, passed as the `loading` placeholder of the
 * dynamically imported shell component.
 */
const Black = styled.div`
height: 100%;
background-color: black;
`;

/**
 * Lazily loaded `JobShell` component.
 * Server-side rendering is turned off for it, and the `Black` placeholder is
 * displayed while the module is being loaded.
 */
const JobShellComponent = dynamic(
  () => import("src/components/shell/JobShell").then(({ JobShell }) => JobShell),
  { loading: Black, ssr: false },
);

/**
 * Page that renders the `JobShell` component for one job of one cluster,
 * together with a header describing the connection and a reload button.
 *
 * @param params Dynamic route segments: `clusterId` and `jobId`.
 */
export default function Page({ params }: {params: {clusterId: string, jobId: string}}) {

  const { publicConfig, user } = usePublicConfig();

  // Show the configured cluster name when there is one; otherwise fall back to the raw id.
  const matchedCluster = publicConfig.CLUSTERS.find((c) => c.id === params.clusterId);
  const clusterName = matchedCluster?.name || params.clusterId;

  const pageTitle = `${params.clusterId}的终端`;
  const headline = `用户 ${user.identityId} 连接到集群 ${clusterName} 的作业 ${params.jobId}`;

  // Reloading the page tears down the current shell and opens a fresh connection.
  const reloadPage = () => window.location.reload();

  return (
    <Container>
      <Head title={pageTitle} />
      <Header>
        <h2>
          {headline}
        </h2>
        <Space wrap>
          <Button onClick={reloadPage}>
            {"刷新并重新连接"}
          </Button>
        </Space>
      </Header>
      <TerminalContainer>
        <JobShellComponent
          user={user}
          cluster={params.clusterId}
          jobId={params.jobId}
        />
      </TerminalContainer>
    </Container>
  );
}
14 changes: 10 additions & 4 deletions apps/ai/src/app/(auth)/jobs/[clusterId]/AppSessionsTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import { ExclamationCircleOutlined } from "@ant-design/icons";
import { App, Button, Checkbox, Form, Input, Popconfirm, Space, Table, TableColumnsType, Tooltip } from "antd";
import Link from "next/link";
import { useRouter } from "next/navigation";
import { join } from "path";
import React, { useCallback, useEffect, useMemo, useState } from "react";
Expand Down Expand Up @@ -96,7 +97,7 @@ export const AppSessionsTable: React.FC<Props> = ({ cluster, status }) => {
{
title: "类型",
dataIndex: "jobType",
width: "10%",
width: "8%",
render: (_, record) => {
if (record.jobType === JobType.APP) {
return "应用";
Expand All @@ -107,19 +108,20 @@ export const AppSessionsTable: React.FC<Props> = ({ cluster, status }) => {
{
title: "应用",
dataIndex: "appId",
width: "8%",
render: (appId: string, record) => record.appName ?? appId,
sorter: (a, b) => (!a.submitTime || !b.submitTime) ? -1 : compareDateTime(a.submitTime, b.submitTime),
},
{
title: "提交时间",
dataIndex: "submitTime",
width: "15%",
width: "200px",
render: (_, record) => record.submitTime ? formatDateTime(record.submitTime) : "",
},
{
title: "状态",
dataIndex: "state",
width: "12%",
width: "120px",
render: (_, record) => (
record.reason ? (
<Tooltip title={record.reason}>
Expand All @@ -140,14 +142,15 @@ export const AppSessionsTable: React.FC<Props> = ({ cluster, status }) => {
},
...(unfinished ? [{
title: "剩余时间",
width: "100px",
dataIndex: "remainingTime",
},
] : []),
{
title: "操作",
key: "action",
fixed:"right",
width: "10%",
width: "350px",
render: (_, record) => (
<Space>
{
Expand All @@ -160,6 +163,9 @@ export const AppSessionsTable: React.FC<Props> = ({ cluster, status }) => {
refreshToken={connectivityRefreshToken}
/>
)}
<Link href={`/jobShell/${cluster.id}/${record.jobId}`} target="_blank">
{"进入容器"}
</Link>
<Popconfirm
title="确定结束这个任务吗"
onConfirm={
Expand Down
6 changes: 4 additions & 2 deletions apps/ai/src/app/(auth)/jobs/[clusterId]/ConnectToAppLink.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import { useEffect } from "react";
import { DisabledA } from "src/components/DisabledA";
import { AppSession } from "src/server/trpc/route/jobs/apps";
import { trpc } from "src/utils/trpc";
import { openDesktop } from "src/utils/vnc";

import { usePublicConfig } from "../../context";

Expand All @@ -31,7 +32,7 @@ export interface Props {
export const ConnectTopAppLink: React.FC<Props> = ({
session, cluster, refreshToken,
}) => {
const { publicConfig: { BASE_PATH } } = usePublicConfig();
const { publicConfig: { BASE_PATH, NOVNC_CLIENT_URL } } = usePublicConfig();
const { message } = App.useApp();

const { data, refetch } = trpc.jobs.checkAppConnectivity.useQuery({ clusterId: cluster, jobId: session.jobId }, {
Expand Down Expand Up @@ -100,7 +101,8 @@ export const ConnectTopAppLink: React.FC<Props> = ({
}

} else {
// TODO: vnc app
const { host, port, password } = reply;
openDesktop(BASE_PATH, NOVNC_CLIENT_URL, cluster, host, port, password);
return;
}

Expand Down

0 comments on commit e312efb

Please sign in to comment.